In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to imported .py files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [32]:
import pandas as pd
import re
import string
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'

In [3]:
from preproc_text import process_tweets
from preproc_abbv import convert_abbrev_in_text

In [4]:
from sklearn.model_selection import train_test_split

### Note: the CSV below is read in without `encoding='latin'`

In [6]:
# Load the raw tweet CSV. Column names are supplied explicitly, so pandas
# treats the first row of the file as data rather than as a header.
df = pd.read_csv(
    '../raw_data/training.1600000.processed.noemoticon.csv',
    names=['sentiment', '2', '3', '4', '5', 'tweet'],
)

In [7]:
# Work on a reproducible 5% random subset of the rows (seeded with 0).
# Caution: this rebinds `df`, so re-running the cell samples 5% of the
# already-reduced frame.
df = (
    df
    .sample(frac=0.05, random_state=0)
    .copy()
)

In [8]:
# Keep only the tweet text and its sentiment label, text column first,
# then preview the first rows.
df = df.loc[:, ['tweet', 'sentiment']]
df.head()

Unnamed: 0,tweet,sentiment
557138,wants to compete! i want hard competition! i w...,0
349381,It seems we are stuck on the ground in Amarill...,0
182051,where the f are my pinking shears? rarararrrar...,0
571236,0ff t0 tHE MEEtiN.. i HAtE WhEN PPl V0lUNtEER...,0
1339637,@ reply me pls,4


In [9]:
# Hold out 20% of the sampled tweets as the final test set. The split is
# stratified on the sentiment label so both parts keep the same class
# proportions, and random_state is fixed so the split is identical after a
# kernel restart (matching the seed used for sampling elsewhere).
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['sentiment'],
    test_size=0.2,
    stratify=df['sentiment'],
    random_state=0,
)

In [10]:
# Carve a validation set out of the training data: 25% of the current
# training split, stratified on the label. random_state is fixed so the
# validation set is the same on every run.
# Caution: this rebinds X_train / y_train, so re-running the cell on its own
# shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=0,
)

# Processing of tweets

In [11]:
from preproc_class import TextPreprocess
from convert_class import TextConvertAbbv

In [12]:
# Text preparation stage: TextPreprocess runs first, then TextConvertAbbv
# is applied to its output.
preprocess_pipe = Pipeline(steps=[
    ('clean_text', TextPreprocess()),
    ('convert_text', TextConvertAbbv()),
])

In [16]:
# End-to-end model: text preparation -> bag-of-words counts -> k-nearest
# neighbours classifier, all with default settings.
final_pipe = Pipeline(steps=[
    ('preprocess', preprocess_pipe),
    ('vectorizer', CountVectorizer()),
    ('model', KNeighborsClassifier()),
])

## Baseline score on validation set with no parameter tuning:

In [16]:
%time
final_pipe.fit(X_train,y_train)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs


Pipeline(steps=[('preprocess',
                 Pipeline(steps=[('clean_text', TextPreprocess()),
                                 ('convert_text', TextConvertAbbv())])),
                ('vectorizer', CountVectorizer()), ('model', MultinomialNB())])

## Score on training set:

In [14]:
# Mean accuracy on the same data the pipeline was fitted on.
# NOTE(review): execution counts are out of order and the saved fit output
# shows MultinomialNB, so the saved score may predate the
# KNeighborsClassifier pipeline - re-run to confirm.
final_pipe.score(X_train,y_train)

0.8562291666666667

In [13]:
# Mean accuracy on the held-out validation split.
# NOTE(review): the saved output includes timing lines although this cell has
# no timing magic, so the output looks stale - re-run to confirm.
final_pipe.score(X_val,y_val)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


0.754

## Pipeline for parameter tuning

#### The things to try here are:
- Bag of Words
- Tf-Idf
- N-grams

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [19]:
# List every tunable parameter of the pipeline; nested step parameters appear
# as '<step>__<parameter>' and are the keys accepted in a GridSearchCV grid.
final_pipe.get_params()

{'memory': None,
 'steps': [('preprocess',
   Pipeline(steps=[('clean_text', TextPreprocess()),
                   ('convert_text', TextConvertAbbv())])),
  ('vectorizer', CountVectorizer()),
  ('model', KNeighborsClassifier())],
 'verbose': False,
 'preprocess': Pipeline(steps=[('clean_text', TextPreprocess()),
                 ('convert_text', TextConvertAbbv())]),
 'vectorizer': CountVectorizer(),
 'model': KNeighborsClassifier(),
 'preprocess__memory': None,
 'preprocess__steps': [('clean_text', TextPreprocess()),
  ('convert_text', TextConvertAbbv())],
 'preprocess__verbose': False,
 'preprocess__clean_text': TextPreprocess(),
 'preprocess__convert_text': TextConvertAbbv(),
 'vectorizer__analyzer': 'word',
 'vectorizer__binary': False,
 'vectorizer__decode_error': 'strict',
 'vectorizer__dtype': numpy.int64,
 'vectorizer__encoding': 'utf-8',
 'vectorizer__input': 'content',
 'vectorizer__lowercase': True,
 'vectorizer__max_df': 1.0,
 'vectorizer__max_features': None,
 'vectorizer_

In [23]:
# 5-fold cross-validated search scored on accuracy. The vectorizer options are
# each pinned to a single value, so only the number of neighbours varies
# (5 candidates in total).
grid_search = GridSearchCV(
    estimator=final_pipe,
    param_grid={
        # Keys address nested pipeline steps as '<step>__<parameter>'.
        # To compare vectorizers as well, a 'vectorizer' key with
        # [CountVectorizer(), TfidfVectorizer()] could be added here.
        'vectorizer__ngram_range': [(1, 2)],
        'vectorizer__max_df': [0.6],
        'vectorizer__min_df': [1],
        'model__n_neighbors': [1, 3, 5, 7, 9],
    },
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use all available cores
    verbose=True,
)

# Random sample of X_train to prevent it from taking all day...

In [24]:
# Draw a reproducible 5,000-tweet subsample for the grid search, then pick the
# labels by the sampled index. Features and labels are therefore paired by
# construction, instead of relying on two independent .sample() calls selecting
# the same rows because they share a seed and length.
X_samp = X_train.sample(5000, random_state=0)
y_samp = y_train.loc[X_samp.index]

In [25]:
# Run the search on the 5,000-row subsample only (5 candidates x 5 folds).
grid_search.fit(X_samp,y_samp)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocess',
                                        Pipeline(steps=[('clean_text',
                                                         TextPreprocess()),
                                                        ('convert_text',
                                                         TextConvertAbbv())])),
                                       ('vectorizer', CountVectorizer()),
                                       ('model', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'model__n_neighbors': [1, 3, 5, 7, 9],
                         'vectorizer__max_df': [0.6], 'vectorizer__min_df': [1],
                         'vectorizer__ngram_range': [(1, 2)]},
             scoring='accuracy', verbose=True)

In [26]:
# Parameter combination with the best mean cross-validated accuracy.
grid_search.best_params_

{'model__n_neighbors': 1,
 'vectorizer__max_df': 0.6,
 'vectorizer__min_df': 1,
 'vectorizer__ngram_range': (1, 2)}

In [27]:
# Pipeline configured with the best parameters found by the search.
# NOTE(review): the name says "NB", but the pipeline's final step is
# KNeighborsClassifier, not a naive Bayes model.
best_NB = grid_search.best_estimator_

## Refit on the full training set before scoring

In [28]:
# The search only saw the 5,000-row subsample; refit the selected pipeline on
# the whole training split before evaluating it.
best_NB.fit(X_train,y_train)

Pipeline(steps=[('preprocess',
                 Pipeline(steps=[('clean_text', TextPreprocess()),
                                 ('convert_text', TextConvertAbbv())])),
                ('vectorizer', CountVectorizer(max_df=0.6, ngram_range=(1, 2))),
                ('model', KNeighborsClassifier(n_neighbors=1))])

In [29]:
# Mean accuracy on the training split. With n_neighbors=1 each training tweet
# is usually its own nearest neighbour, so a near-perfect value is expected
# and says little about generalisation.
best_NB.score(X_train,y_train)

0.9941666666666666

In [30]:
# Mean accuracy on the validation split; compare with the training accuracy
# to gauge overfitting.
best_NB.score(X_val,y_val)

0.6330625

In [31]:
# Mean accuracy on the held-out test split, which was not used for fitting or
# for the parameter search.
best_NB.score(X_test,y_test)

0.628875