In [1]:
# Show the kernel's working directory. The bare-module imports in the following
# cells (training_data, data_cleaning) presumably resolve relative to it — TODO confirm.
!pwd

/Users/thomasgiannetti/code/contatc2/green_mood_tracker/green_mood_tracker


In [2]:
from training_data import get_raw_data

In [3]:
# Load the dataset returned by the project helper.
# NOTE(review): the SettingWithCopyWarning recorded in this cell's output points
# at a line inside the helper (`sentiment140_final['source'] = ...`), not at this call.
df = get_raw_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  sentiment140_final['source'] = 'sentiment140'


In [4]:
# Keep only the first 5000 rows to make the experiments below fast.
# NOTE(review): this is a head slice, not a random sample; if the helper
# concatenates several source datasets in order, the subset may be dominated by
# the first source(s) — confirm, and consider a seeded `.sample(...)` instead.
df = df[0:5000]

In [5]:
# Class balance of the target column in the 5000-row subset.
# NOTE(review): the recorded output has three labels (0, 4, 1). If 4 and 1 both
# encode "positive" in different source datasets they should be unified before
# training — confirm against the data-loading helper.
df['polarity'].value_counts()

0    2245
4    1588
1    1167
Name: polarity, dtype: int64

In [6]:
from data_cleaning import clean

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/thomasgiannetti/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Run the project cleaning helper on the 'text' column and rebind `df` to the result.
# NOTE(review): because `df` is overwritten, re-running this cell cleans
# already-cleaned text; presumably harmless, but verify the helper is idempotent.
df = clean(df, 'text')

In [8]:
# Sanity check: (rows, columns) of the frame after cleaning.
df.shape

(5000, 4)

In [9]:
# Preview the first rows to eyeball what the cleaning step did to the text.
df.head()

Unnamed: 0,id,text,polarity,source
0,1467933112,angel going miss athlete weekend,0,sts_gold
1,2323395086,look though shaq getting traded cleveland play...,0,sts_gold
2,1467968979,april th isnt coming soon enough,0,sts_gold
3,1990283756,drinking mcdonalds coffee understanding someon...,0,sts_gold
4,1988884918,dissapointed taylor swift doesnt twitter,0,sts_gold


In [10]:
# Count missing values in the cleaned text column.
# Note: isna() only flags NaN/None; rows whose text became an empty string
# during cleaning are not counted here.
df['text'].isna().value_counts()

False    5000
Name: text, dtype: int64

### Bag of Words Modelling

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts with CountVectorizer's default settings.
vectorizer = CountVectorizer()

# Learn the vocabulary and count terms on every row of the frame.
X = vectorizer.fit_transform(df['text'])

# Dense copy of the count matrix; displayed below and reused by the next cells.
# NOTE(review): MultinomialNB also accepts the sparse `X` directly, which would
# avoid materialising the dense array — confirm before changing.
X_bow = X.toarray()
X_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
from sklearn.naive_bayes import MultinomialNB

# Target labels: the `polarity` column of the cleaned frame.
y = df['polarity']
# Fit multinomial Naive Bayes on the dense counts and score it on the SAME rows,
# so the displayed value is training accuracy, not a generalisation estimate.
nb_model = MultinomialNB()
nb_model.fit(X_bow,y)
nb_model.score(X_bow,y)

0.8706

In [14]:
from sklearn.model_selection import cross_validate

# Mean 5-fold cross-validated accuracy of a fresh MultinomialNB on the count features.
# NOTE(review): X_bow comes from a vectorizer fitted on every row before the
# split, so each fold's vocabulary was built with its held-out rows included.
modelcv= cross_validate(MultinomialNB(),X_bow, y,cv=5, scoring='accuracy')
modelcv['test_score'].mean()

0.6423871157871157

### N-Gram Modelling / 2-gram (TF-IDF features)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Bigram-only TF-IDF features: ngram_range=(2, 2) keeps two-word sequences and
# drops single words.
tf_idf_vectorizer = TfidfVectorizer(ngram_range = (2,2))

# Learn the bigram vocabulary and IDF weights on every row of the frame.
X_1 = tf_idf_vectorizer.fit_transform(df['text'])

# Keep the features sparse. A bigram vocabulary is far larger than the unigram
# one, so a dense float64 array of shape (rows, vocabulary) is almost entirely
# zeros and can be very large. MultinomialNB.fit/score and cross_validate all
# accept scipy sparse input, so the name used by the following cells is simply
# bound to the sparse matrix.
X_bow1 = X_1

In [17]:
# Fit on the bigram TF-IDF features and score on the same rows (training accuracy).
ngram_model = MultinomialNB()
ngram_model.fit(X_bow1,y)
ngram_model.score(X_bow1,y)

0.9662

In [18]:
# Mean 5-fold cross-validated accuracy on the bigram TF-IDF features.
# NOTE(review): the TF-IDF vectorizer was fitted on all rows before splitting,
# so the vocabulary and IDF weights include each fold's held-out rows.
ngram_cv = cross_validate(MultinomialNB(),X_bow1, y, cv=5, scoring='accuracy')
ngram_cv['test_score'].mean()

0.5030072016072016

### N-Gram Modelling / 3-gram (TF-IDF features)

In [19]:
# Trigram-only TF-IDF features: ngram_range=(3, 3) keeps three-word sequences only.
# This rebinds `tf_idf_vectorizer`; any vectorizer previously stored under that
# name is no longer reachable after this cell.
tf_idf_vectorizer = TfidfVectorizer(ngram_range = (3,3))

# Learn the trigram vocabulary and IDF weights on every row of the frame.
X_2 = tf_idf_vectorizer.fit_transform(df['text'])

# Keep the features sparse. Almost every trigram is unique to one document, so
# a dense float64 array of shape (rows, vocabulary) is nearly all zeros and can
# be very large. MultinomialNB.fit/score and cross_validate all accept scipy
# sparse input, so the name used by the following cells is simply bound to the
# sparse matrix.
X_bow2 = X_2

In [20]:
# Fit on the trigram TF-IDF features and score on the same rows (training accuracy).
# Rebinds `ngram_model` to a new estimator.
ngram_model = MultinomialNB()
ngram_model.fit(X_bow2,y)
ngram_model.score(X_bow2,y)

0.9466

In [21]:
# Mean 5-fold cross-validated accuracy on the trigram TF-IDF features.
# Rebinds `ngram_cv` to the new results.
# NOTE(review): the TF-IDF vectorizer was fitted on all rows before splitting,
# so the vocabulary and IDF weights include each fold's held-out rows.
ngram_cv = cross_validate(MultinomialNB(),X_bow2, y, cv=5, scoring='accuracy')
ngram_cv['test_score'].mean()

0.4636045694045694

### Pipeline - TfidfVectorizer

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Chain TF-IDF feature extraction and multinomial Naive Bayes in one estimator,
# so the vectorizer is re-fitted on the training portion of every CV split.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("nb", MultinomialNB()),
])

# Search space: 3 n-gram ranges x 3 vocabulary caps x 4 smoothing values
# = 36 candidate configurations.
parameters = {
    "tfidf__ngram_range": ((1, 1), (2, 2), (3, 3)),
    "tfidf__max_features": (1000, 2000, 2500),
    "nb__alpha": (0.05, 0.1, 0.5, 1),
}

# Exhaustive search scored by accuracy with 5-fold CV, using all available
# cores; refit=True retrains the best configuration on the full data at the end.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

# Raw text goes in; vectorisation happens inside the pipeline.
grid_search.fit(df["text"], y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   44.6s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...True,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'tfidf__ngram_range': ((1, 1), (2, 2), (3, 3)), 'tfidf__max_features': (1000, 2000, 2500), 'nb__alpha': (0.05, 0.1, 0.5, 1)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [23]:
# Parameter combination with the highest mean cross-validated accuracy.
grid_search.best_params_

{'nb__alpha': 0.5, 'tfidf__max_features': 2000, 'tfidf__ngram_range': (1, 1)}

In [24]:
# Mean cross-validated accuracy achieved by the best parameter combination.
grid_search.best_score_

0.6362

### Pipeline - CountVectorizer

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Same search as a pipeline of raw term counts (CountVectorizer) feeding
# multinomial Naive Bayes. This cell rebinds `pipeline`, `parameters` and
# `grid_search`, so any earlier search stored under those names is replaced.
pipeline = Pipeline(steps=[
    ("ctv", CountVectorizer()),
    ("nb", MultinomialNB()),
])

# Search space: 3 n-gram ranges x 3 vocabulary caps x 5 smoothing values
# = 45 candidate configurations.
parameters = {
    "ctv__ngram_range": ((1, 1), (2, 2), (3, 3)),
    "ctv__max_features": (1000, 2000, 2500),
    "nb__alpha": (0.05, 0.1, 0.5, 1, 2),
}

# Exhaustive search scored by accuracy with 5-fold CV, using all available
# cores; refit=True retrains the best configuration on the full data at the end.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

# Raw text goes in; vectorisation happens inside the pipeline.
grid_search.fit(df["text"], y)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:   36.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('ctv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'ctv__ngram_range': ((1, 1), (2, 2), (3, 3)), 'ctv__max_features': (1000, 2000, 2500), 'nb__alpha': (0.05, 0.1, 0.5, 1, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [26]:
# Best parameter combination found by the CountVectorizer search
# (`grid_search` was rebound by the cell above).
grid_search.best_params_

{'ctv__max_features': 2000, 'ctv__ngram_range': (1, 1), 'nb__alpha': 2}