In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import re
import string
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB

In [3]:
from preproc_text import process_tweets
from preproc_abbv import convert_abbrev_in_text

In [4]:
from sklearn.model_selection import train_test_split

### This has been read in without the encoding='latin'

In [5]:
df = pd.read_csv('../raw_data/kaggle.csv', names = ['sentiment','2','3','4','5','tweet'])

In [6]:
df = df.sample(frac=0.05, random_state=0).copy()

In [7]:
df = df[['tweet','sentiment']]
df.head()

Unnamed: 0,tweet,sentiment
557138,wants to compete! i want hard competition! i w...,0
349381,It seems we are stuck on the ground in Amarill...,0
182051,where the f are my pinking shears? rarararrrar...,0
571236,0ff t0 tHE MEEtiN.. i HAtE WhEN PPl V0lUNtEER...,0
1339637,@ reply me pls,4


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'],df['sentiment'],test_size=0.2, stratify=df['sentiment'])

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.25,stratify=y_train)

# Processing of tweets

In [10]:
from preproc_class import TextPreprocess
from convert_class import TextConvertAbbv

In [11]:
preprocess_pipe = Pipeline([('clean_text', TextPreprocess()),('convert_text',TextConvertAbbv())])

In [12]:
final_pipe = Pipeline([('preprocess',preprocess_pipe),('vectorizer', CountVectorizer()), ('model', MultinomialNB())])

## Baseline score on validation set with no parameter tuning:

In [16]:
%time
final_pipe.fit(X_train,y_train)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs


Pipeline(steps=[('preprocess',
                 Pipeline(steps=[('clean_text', TextPreprocess()),
                                 ('convert_text', TextConvertAbbv())])),
                ('vectorizer', CountVectorizer()), ('model', MultinomialNB())])

## Score on trained set:

In [14]:
final_pipe.score(X_train,y_train)

0.8562291666666667

In [13]:
final_pipe.score(X_val,y_val)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


0.754

## Pipeline for parameter tuning

#### The things to try here are:
- Bag of Words
- Tf-Idf
- N-grams

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [18]:
final_pipe.get_params()

{'memory': None,
 'steps': [('preprocess',
   Pipeline(steps=[('clean_text', TextPreprocess()),
                   ('convert_text', TextConvertAbbv())])),
  ('vectorizer', CountVectorizer()),
  ('model', MultinomialNB())],
 'verbose': False,
 'preprocess': Pipeline(steps=[('clean_text', TextPreprocess()),
                 ('convert_text', TextConvertAbbv())]),
 'vectorizer': CountVectorizer(),
 'model': MultinomialNB(),
 'preprocess__memory': None,
 'preprocess__steps': [('clean_text', TextPreprocess()),
  ('convert_text', TextConvertAbbv())],
 'preprocess__verbose': False,
 'preprocess__clean_text': TextPreprocess(),
 'preprocess__convert_text': TextConvertAbbv(),
 'vectorizer__analyzer': 'word',
 'vectorizer__binary': False,
 'vectorizer__decode_error': 'strict',
 'vectorizer__dtype': numpy.int64,
 'vectorizer__encoding': 'utf-8',
 'vectorizer__input': 'content',
 'vectorizer__lowercase': True,
 'vectorizer__max_df': 1.0,
 'vectorizer__max_features': None,
 'vectorizer__min_df': 1,
 

In [19]:
grid_search = GridSearchCV(
    final_pipe, 
    param_grid={
        # Access any component of the pipeline, as far back as you want
#         'vectorizer': [CountVectorizer(), TfidfVectorizer()],
        'vectorizer__ngram_range': [(1,1),(1,2),(2,2)],
        'vectorizer__max_df':[0.6,0.7,0.8,1],
        'vectorizer__min_df':[1,3]},
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=True)

# Random sample of X_train to prevent it from taking all day...

In [20]:
X_samp = X_train.sample(5000, random_state=0)
y_samp = y_train.sample(5000, random_state=0)

In [21]:
grid_search.fit(X_samp,y_samp)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


 0.707  0.5442 0.7016 0.709  0.5782 0.7046 0.707  0.5442 0.5104 0.5336
 0.5326    nan    nan    nan]


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocess',
                                        Pipeline(steps=[('clean_text',
                                                         TextPreprocess()),
                                                        ('convert_text',
                                                         TextConvertAbbv())])),
                                       ('vectorizer', CountVectorizer()),
                                       ('model', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'vectorizer__max_df': [0.6, 0.7, 0.8, 1],
                         'vectorizer__min_df': [1, 3],
                         'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)]},
             scoring='accuracy', verbose=True)

In [24]:
grid_search.best_params_

{'vectorizer__max_df': 0.6,
 'vectorizer__min_df': 1,
 'vectorizer__ngram_range': (1, 2)}

In [25]:
best_NB = grid_search.best_estimator_

## Fit on all train set again before doing the score

In [30]:
best_NB.fit(X_train,y_train)

Pipeline(steps=[('preprocess',
                 Pipeline(steps=[('clean_text', TextPreprocess()),
                                 ('convert_text', TextConvertAbbv())])),
                ('vectorizer', CountVectorizer(max_df=0.6, ngram_range=(1, 2))),
                ('model', MultinomialNB())])

In [31]:
best_NB.score(X_train,y_train)

0.963375

In [32]:
best_NB.score(X_val,y_val)

0.757625

In [33]:
best_NB.score(X_test,y_test)

0.75625