# HW7: Comparing MNB & SVM with Kaggle Sentiment Data

## OVERVIEW

---
### VECTORIZERS USED:
    CountVectorizer
    TfidfVectorizer

### MODELS USED:
    Multinomial Naive Bayes (MNB)
    Support Vector Machines (SVM)
---

---
#### VECTORIZATION PARAMS:
    Binary
    Stopwords
    Unigrams, Bigrams
    Min & Max df
---

#### TODO:
    Stemming?
    Vadar + TextBlob

### FUNCTION & PACKAGE PARTY

In [1]:
## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## ----- VECTORIZORS
unigram_bool_cv_v1 = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
unigram_bool_cv_v2 = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english', 
                                     token_pattern=r'(?u)\b[a-zA-Z]{2,}\b' )

unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', 
                             token_pattern=r'(?u)\b[a-zA-Z]{2,}\b' )

bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
bigram_cv_v2 = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english', 
                               token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
unigram_tv_v2 = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english', 
                                token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
bigram_tv_v2 = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english', 
                               token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

## ----- CLASSIFIERS
mnb = MultinomialNB()
svm = LinearSVC(C=1)

def get_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    clf = classifier
    clf.fit(X_train_vec,y_train)
    y_pred = clf.predict(X_test_vec)
    report = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)
    score = clf.score(X_test_vec,y_test)
    return clf, score, report
    
def get_model(X, y, labels, target_names, classifier, vec):
    X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
    model, score, report = run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier)
    return model, score, report
    
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd

def return_features(vec, model):
    for i,feature_probability in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        df1 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[:10])
        df2 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[-10:])
        df3 = pd.concat([df1, df2], axis=1)
        print(tabulate(df3, tablefmt="fancy_grid", headers=["Most","Likely","Least","Likely"], floatfmt=".2f"))

def update_big_df(big_df, new_row):
    big_df.append(new_row)
    df = pd.DataFrame(big_df)
    df = df.drop_duplicates()
    return df





### DATA GOES HERE:

In [2]:
# import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values

# TASK 1

## TEST 1 -- MNB & SVM with Vectorizer 1


In [3]:
big_df = []

In [4]:
vec = unigram_bool_cv_v1
classifier = mnb

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df

╒════╤════════╤══════════╤═════════╤════════════╕
│    │   Most │ Likely   │   Least │ Likely     │
╞════╪════════╪══════════╪═════════╪════════════╡
│  0 │ -10.48 │ 102      │   -5.95 │ time       │
├────┼────────┼──────────┼─────────┼────────────┤
│  1 │ -10.48 │ 10th     │   -5.94 │ minutes    │
├────┼────────┼──────────┼─────────┼────────────┤
│  2 │ -10.48 │ 127      │   -5.93 │ characters │
├────┼────────┼──────────┼─────────┼────────────┤
│  3 │ -10.48 │ 13th     │   -5.93 │ story      │
├────┼────────┼──────────┼─────────┼────────────┤
│  4 │ -10.48 │ 14       │   -5.90 │ comedy     │
├────┼────────┼──────────┼─────────┼────────────┤
│  5 │ -10.48 │ 16       │   -5.70 │ just       │
├────┼────────┼──────────┼─────────┼────────────┤
│  6 │ -10.48 │ 163      │   -5.20 │ like       │
├────┼────────┼──────────┼─────────┼────────────┤
│  7 │ -10.48 │ 168      │   -5.07 │ bad        │
├────┼────────┼──────────┼─────────┼────────────┤
│  8 │ -10.48 │ 170      │   -4.85 │ film       │


Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401


In [5]:
vec = unigram_bool_cv_v1
classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df

╒════╤════════╤═══════════╤═════════╤════════════════╕
│    │   Most │ Likely    │   Least │ Likely         │
╞════╪════════╪═══════════╪═════════╪════════════════╡
│  0 │  -1.84 │ hawke     │    1.63 │ cesspool       │
├────┼────────┼───────────┼─────────┼────────────────┤
│  1 │  -1.70 │ collar    │    1.66 │ pompous        │
├────┼────────┼───────────┼─────────┼────────────────┤
│  2 │  -1.70 │ giddy     │    1.69 │ stinks         │
├────┼────────┼───────────┼─────────┼────────────────┤
│  3 │  -1.59 │ swimfan   │    1.70 │ distasteful    │
├────┼────────┼───────────┼─────────┼────────────────┤
│  4 │  -1.57 │ blue      │    1.71 │ unwatchable    │
├────┼────────┼───────────┼─────────┼────────────────┤
│  5 │  -1.49 │ dogtown   │    1.73 │ disappointment │
├────┼────────┼───────────┼─────────┼────────────────┤
│  6 │  -1.43 │ victim    │    1.76 │ unbearable     │
├────┼────────┼───────────┼─────────┼────────────────┤
│  7 │  -1.42 │ joan      │    1.81 │ stinker        │
├────┼────

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183


NOTES: Very interesting!! MNB is very cluttered with numbers. SVM is not. 

## TEST 2 -- MNB & SVM with Vectorizer 2



In [6]:
vec = unigram_bool_cv_v2
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V2', 'score': score})
df

╒════╤════════╤═════════════╤═════════╤════════════╕
│    │   Most │ Likely      │   Least │ Likely     │
╞════╪════════╪═════════════╪═════════╪════════════╡
│  0 │ -10.47 │ aaliyah     │   -5.94 │ time       │
├────┼────────┼─────────────┼─────────┼────────────┤
│  1 │ -10.47 │ abagnale    │   -5.93 │ minutes    │
├────┼────────┼─────────────┼─────────┼────────────┤
│  2 │ -10.47 │ abandoned   │   -5.92 │ characters │
├────┼────────┼─────────────┼─────────┼────────────┤
│  3 │ -10.47 │ abbreviated │   -5.92 │ story      │
├────┼────────┼─────────────┼─────────┼────────────┤
│  4 │ -10.47 │ abel        │   -5.90 │ comedy     │
├────┼────────┼─────────────┼─────────┼────────────┤
│  5 │ -10.47 │ abhors      │   -5.69 │ just       │
├────┼────────┼─────────────┼─────────┼────────────┤
│  6 │ -10.47 │ abiding     │   -5.19 │ like       │
├────┼────────┼─────────────┼─────────┼────────────┤
│  7 │ -10.47 │ ably        │   -5.06 │ bad        │
├────┼────────┼─────────────┼─────────┼───────

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183
2,mnb,V2,0.606978


In [7]:
vec = unigram_bool_cv_v2
classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V2', 'score': score})
df

╒════╤════════╤═══════════╤═════════╤════════════════╕
│    │   Most │ Likely    │   Least │ Likely         │
╞════╪════════╪═══════════╪═════════╪════════════════╡
│  0 │  -1.81 │ hawke     │    1.63 │ cesspool       │
├────┼────────┼───────────┼─────────┼────────────────┤
│  1 │  -1.71 │ collar    │    1.66 │ pompous        │
├────┼────────┼───────────┼─────────┼────────────────┤
│  2 │  -1.69 │ giddy     │    1.69 │ stinks         │
├────┼────────┼───────────┼─────────┼────────────────┤
│  3 │  -1.59 │ swimfan   │    1.70 │ distasteful    │
├────┼────────┼───────────┼─────────┼────────────────┤
│  4 │  -1.57 │ blue      │    1.71 │ unwatchable    │
├────┼────────┼───────────┼─────────┼────────────────┤
│  5 │  -1.45 │ dogtown   │    1.72 │ disappointment │
├────┼────────┼───────────┼─────────┼────────────────┤
│  6 │  -1.41 │ victim    │    1.76 │ unbearable     │
├────┼────────┼───────────┼─────────┼────────────────┤
│  7 │  -1.41 │ joan      │    1.81 │ disgusting     │
├────┼────

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183
2,mnb,V2,0.606978
3,svm,V2,0.624503


## TEST 3 -- MNB & SVM with Vectorizer 3


In [8]:
vec = unigram_cv
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V3', 'score': score})
df

╒════╤════════╤═════════════╤═════════╤════════════╕
│    │   Most │ Likely      │   Least │ Likely     │
╞════╪════════╪═════════════╪═════════╪════════════╡
│  0 │ -10.48 │ aaliyah     │   -5.93 │ time       │
├────┼────────┼─────────────┼─────────┼────────────┤
│  1 │ -10.48 │ abagnale    │   -5.92 │ characters │
├────┼────────┼─────────────┼─────────┼────────────┤
│  2 │ -10.48 │ abandoned   │   -5.91 │ minutes    │
├────┼────────┼─────────────┼─────────┼────────────┤
│  3 │ -10.48 │ abbreviated │   -5.91 │ story      │
├────┼────────┼─────────────┼─────────┼────────────┤
│  4 │ -10.48 │ abel        │   -5.90 │ comedy     │
├────┼────────┼─────────────┼─────────┼────────────┤
│  5 │ -10.48 │ abhors      │   -5.68 │ just       │
├────┼────────┼─────────────┼─────────┼────────────┤
│  6 │ -10.48 │ abiding     │   -5.13 │ like       │
├────┼────────┼─────────────┼─────────┼────────────┤
│  7 │ -10.48 │ ably        │   -4.97 │ bad        │
├────┼────────┼─────────────┼─────────┼───────

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183
2,mnb,V2,0.606978
3,svm,V2,0.624503
4,mnb,V3,0.606658


In [9]:
vec = unigram_cv
classifier = svm


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V3', 'score': score})
df

╒════╤════════╤══════════════╤═════════╤════════════════╕
│    │   Most │ Likely       │   Least │ Likely         │
╞════╪════════╪══════════════╪═════════╪════════════════╡
│  0 │  -1.80 │ hawke        │    1.63 │ cesspool       │
├────┼────────┼──────────────┼─────────┼────────────────┤
│  1 │  -1.73 │ giddy        │    1.65 │ disappointment │
├────┼────────┼──────────────┼─────────┼────────────────┤
│  2 │  -1.70 │ collar       │    1.66 │ pompous        │
├────┼────────┼──────────────┼─────────┼────────────────┤
│  3 │  -1.58 │ swimfan      │    1.67 │ stinks         │
├────┼────────┼──────────────┼─────────┼────────────────┤
│  4 │  -1.57 │ blue         │    1.69 │ unwatchable    │
├────┼────────┼──────────────┼─────────┼────────────────┤
│  5 │  -1.45 │ dogtown      │    1.70 │ distasteful    │
├────┼────────┼──────────────┼─────────┼────────────────┤
│  6 │  -1.40 │ clamoring    │    1.75 │ unbearable     │
├────┼────────┼──────────────┼─────────┼────────────────┤
│  7 │  -1.40 

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183
2,mnb,V2,0.606978
3,svm,V2,0.624503
4,mnb,V3,0.606658
5,svm,V3,0.623815


## TEST 4 -- MNB & SVM with Vectorizer 4


In [10]:
vec = bigram_cv
classifier = mnb

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V4', 'score': score})

classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V4', 'score': score})
df

╒════╤════════╤═════════════╤═════════╤════════════╕
│    │   Most │ Likely      │   Least │ Likely     │
╞════╪════════╪═════════════╪═════════╪════════════╡
│  0 │ -11.17 │ 10 course   │   -6.63 │ time       │
├────┼────────┼─────────────┼─────────┼────────────┤
│  1 │ -11.17 │ 10 year     │   -6.62 │ characters │
├────┼────────┼─────────────┼─────────┼────────────┤
│  2 │ -11.17 │ 100 minute  │   -6.61 │ minutes    │
├────┼────────┼─────────────┼─────────┼────────────┤
│  3 │ -11.17 │ 100 years   │   -6.61 │ story      │
├────┼────────┼─────────────┼─────────┼────────────┤
│  4 │ -11.17 │ 101 minutes │   -6.60 │ comedy     │
├────┼────────┼─────────────┼─────────┼────────────┤
│  5 │ -11.17 │ 101 premise │   -6.38 │ just       │
├────┼────────┼─────────────┼─────────┼────────────┤
│  6 │ -11.17 │ 102         │   -5.83 │ like       │
├────┼────────┼─────────────┼─────────┼────────────┤
│  7 │ -11.17 │ 102 minute  │   -5.66 │ bad        │
├────┼────────┼─────────────┼─────────┼───────



╒════╤════════╤═════════════════╤═════════╤═════════════════════╕
│    │   Most │ Likely          │   Least │ Likely              │
╞════╪════════╪═════════════════╪═════════╪═════════════════════╡
│  0 │  -2.01 │ good good       │    1.74 │ charm laughs        │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  1 │  -1.99 │ director ca     │    1.75 │ unappealing         │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  2 │  -1.82 │ variation       │    1.76 │ unwatchable         │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  3 │  -1.73 │ bad cinema      │    1.80 │ unbearable          │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  4 │  -1.60 │ acting ensemble │    1.80 │ waste               │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  5 │  -1.57 │ swimfan         │    1.81 │ utterly incompetent │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  6 │  -1

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183
2,mnb,V2,0.606978
3,svm,V2,0.624503
4,mnb,V3,0.606658
5,svm,V3,0.623815
6,mnb,V4,0.597382
7,svm,V4,0.630094


In [11]:
df

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183
2,mnb,V2,0.606978
3,svm,V2,0.624503
4,mnb,V3,0.606658
5,svm,V3,0.623815
6,mnb,V4,0.597382
7,svm,V4,0.630094


## TEST 5 -- MNB & SVM with Vectorizer 5


In [12]:
vec = bigram_cv_v2
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V5', 'score': score})

classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V5', 'score': score})

╒════╤════════╤═══════════════════╤═════════╤════════════╕
│    │   Most │ Likely            │   Least │ Likely     │
╞════╪════════╪═══════════════════╪═════════╪════════════╡
│  0 │ -11.16 │ aaliyah           │   -6.62 │ time       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  1 │ -11.16 │ abagnale          │   -6.61 │ characters │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  2 │ -11.16 │ abagnale antics   │   -6.60 │ minutes    │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  3 │ -11.16 │ abandon political │   -6.60 │ story      │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  4 │ -11.16 │ abandoned         │   -6.59 │ comedy     │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  5 │ -11.16 │ abbreviated       │   -6.37 │ just       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  6 │ -11.16 │ abel              │   -5.82 │ like       │
├────┼────────┼───────────────────┼─────────┼───────────



╒════╤════════╤═════════════════╤═════════╤═════════════════════╕
│    │   Most │ Likely          │   Least │ Likely              │
╞════╪════════╪═════════════════╪═════════╪═════════════════════╡
│  0 │  -2.02 │ director ca     │    1.75 │ unappealing         │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  1 │  -1.98 │ good good       │    1.75 │ charm laughs        │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  2 │  -1.83 │ variation       │    1.76 │ unwatchable         │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  3 │  -1.73 │ bad cinema      │    1.80 │ unbearable          │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  4 │  -1.59 │ acting ensemble │    1.81 │ utterly incompetent │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  5 │  -1.57 │ swimfan         │    1.81 │ waste               │
├────┼────────┼─────────────────┼─────────┼─────────────────────┤
│  6 │  -1

In [13]:
df

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183
2,mnb,V2,0.606978
3,svm,V2,0.624503
4,mnb,V3,0.606658
5,svm,V3,0.623815
6,mnb,V4,0.597382
7,svm,V4,0.630094
8,mnb,V5,0.598151
9,svm,V5,0.630334


## TEST 6 -- MNB & SVM with Vectorizer 6

In [14]:
vec = unigram_tv
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V6', 'score': score})

classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V6', 'score': score})

╒════╤════════╤══════════╤═════════╤══════════╕
│    │   Most │ Likely   │   Least │ Likely   │
╞════╪════════╪══════════╪═════════╪══════════╡
│  0 │  -9.96 │ 102      │   -6.65 │ time     │
├────┼────────┼──────────┼─────────┼──────────┤
│  1 │  -9.96 │ 10th     │   -6.62 │ does     │
├────┼────────┼──────────┼─────────┼──────────┤
│  2 │  -9.96 │ 127      │   -6.60 │ minutes  │
├────┼────────┼──────────┼─────────┼──────────┤
│  3 │  -9.96 │ 13th     │   -6.52 │ dull     │
├────┼────────┼──────────┼─────────┼──────────┤
│  4 │  -9.96 │ 14       │   -6.36 │ just     │
├────┼────────┼──────────┼─────────┼──────────┤
│  5 │  -9.96 │ 16       │   -6.13 │ worst    │
├────┼────────┼──────────┼─────────┼──────────┤
│  6 │  -9.96 │ 163      │   -6.03 │ like     │
├────┼────────┼──────────┼─────────┼──────────┤
│  7 │  -9.96 │ 168      │   -5.79 │ film     │
├────┼────────┼──────────┼─────────┼──────────┤
│  8 │  -9.96 │ 170      │   -5.41 │ bad      │
├────┼────────┼──────────┼─────────┼────

╒════╤════════╤═══════════════╤═════════╤════════════╕
│    │   Most │ Likely        │   Least │ Likely     │
╞════╪════════╪═══════════════╪═════════╪════════════╡
│  0 │  -2.50 │ flatfooted    │    1.65 │ sunday     │
├────┼────────┼───────────────┼─────────┼────────────┤
│  1 │  -2.42 │ insensitivity │    1.70 │ stammers   │
├────┼────────┼───────────────┼─────────┼────────────┤
│  2 │  -2.42 │ magnificent   │    1.76 │ iris       │
├────┼────────┼───────────────┼─────────┼────────────┤
│  3 │  -2.32 │ delightful    │    1.79 │ fashioning │
├────┼────────┼───────────────┼─────────┼────────────┤
│  4 │  -2.26 │ elegantly     │    2.07 │ unseemly   │
├────┼────────┼───────────────┼─────────┼────────────┤
│  5 │  -2.24 │ pretend       │    2.07 │ pint       │
├────┼────────┼───────────────┼─────────┼────────────┤
│  6 │  -2.23 │ terrific      │    2.16 │ cunning    │
├────┼────────┼───────────────┼─────────┼────────────┤
│  7 │  -2.21 │ household     │    2.32 │ tidings    │
├────┼────

In [15]:
df

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183
2,mnb,V2,0.606978
3,svm,V2,0.624503
4,mnb,V3,0.606658
5,svm,V3,0.623815
6,mnb,V4,0.597382
7,svm,V4,0.630094
8,mnb,V5,0.598151
9,svm,V5,0.630334


## TEST 7 -- MNB & SVM with Vectorizer 7

In [16]:
vec = unigram_tv_v2
classifier = mnb

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V7', 'score': score})

classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V7', 'score': score})

╒════╤════════╤═════════════╤═════════╤══════════╕
│    │   Most │ Likely      │   Least │ Likely   │
╞════╪════════╪═════════════╪═════════╪══════════╡
│  0 │  -9.95 │ aaliyah     │   -6.61 │ long     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  1 │  -9.95 │ abagnale    │   -6.61 │ does     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  2 │  -9.95 │ abandoned   │   -6.51 │ dull     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  3 │  -9.95 │ abbreviated │   -6.46 │ minutes  │
├────┼────────┼─────────────┼─────────┼──────────┤
│  4 │  -9.95 │ abel        │   -6.35 │ just     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  5 │  -9.95 │ abhors      │   -6.10 │ worst    │
├────┼────────┼─────────────┼─────────┼──────────┤
│  6 │  -9.95 │ abiding     │   -6.00 │ like     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  7 │  -9.95 │ ably        │   -5.78 │ film     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  8 │  -9.95 │ aborted     │  

╒════╤════════╤═══════════════╤═════════╤════════════╕
│    │   Most │ Likely        │   Least │ Likely     │
╞════╪════════╪═══════════════╪═════════╪════════════╡
│  0 │  -2.50 │ flatfooted    │    1.65 │ sunday     │
├────┼────────┼───────────────┼─────────┼────────────┤
│  1 │  -2.42 │ magnificent   │    1.69 │ stammers   │
├────┼────────┼───────────────┼─────────┼────────────┤
│  2 │  -2.41 │ insensitivity │    1.76 │ iris       │
├────┼────────┼───────────────┼─────────┼────────────┤
│  3 │  -2.32 │ delightful    │    1.82 │ fashioning │
├────┼────────┼───────────────┼─────────┼────────────┤
│  4 │  -2.26 │ elegantly     │    2.06 │ unseemly   │
├────┼────────┼───────────────┼─────────┼────────────┤
│  5 │  -2.20 │ terrific      │    2.08 │ pint       │
├────┼────────┼───────────────┼─────────┼────────────┤
│  6 │  -2.20 │ household     │    2.16 │ cunning    │
├────┼────────┼───────────────┼─────────┼────────────┤
│  7 │  -2.18 │ masterfully   │    2.31 │ tidings    │
├────┼────

In [17]:
df

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183
2,mnb,V2,0.606978
3,svm,V2,0.624503
4,mnb,V3,0.606658
5,svm,V3,0.623815
6,mnb,V4,0.597382
7,svm,V4,0.630094
8,mnb,V5,0.598151
9,svm,V5,0.630334


## TEST 8 -- MNB & SVM with Vectorizer 8

In [18]:
vec = bigram_tv
classifier = mnb

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V8', 'score': score})

classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V8', 'score': score})

╒════╤════════╤═════════════╤═════════╤══════════╕
│    │   Most │ Likely      │   Least │ Likely   │
╞════╪════════╪═════════════╪═════════╪══════════╡
│  0 │ -10.73 │ 10 course   │   -7.67 │ story    │
├────┼────────┼─────────────┼─────────┼──────────┤
│  1 │ -10.73 │ 10 year     │   -7.65 │ stupid   │
├────┼────────┼─────────────┼─────────┼──────────┤
│  2 │ -10.73 │ 100 minute  │   -7.62 │ mess     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  3 │ -10.73 │ 100 years   │   -7.50 │ dull     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  4 │ -10.73 │ 101 minutes │   -7.39 │ just     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  5 │ -10.73 │ 101 premise │   -7.21 │ worst    │
├────┼────────┼─────────────┼─────────┼──────────┤
│  6 │ -10.73 │ 102         │   -7.11 │ like     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  7 │ -10.73 │ 102 minute  │   -6.85 │ film     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  8 │ -10.73 │ 10th        │  

╒════╤════════╤══════════════╤═════════╤════════════════════════╕
│    │   Most │ Likely       │   Least │ Likely                 │
╞════╪════════╪══════════════╪═════════╪════════════════════════╡
│  0 │  -2.81 │ remarkable   │    1.63 │ like big               │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  1 │  -2.75 │ perfect      │    1.64 │ dramatic constructs    │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  2 │  -2.73 │ beautifully  │    1.64 │ oscar make             │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  3 │  -2.64 │ delightful   │    1.67 │ age film               │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  4 │  -2.59 │ terrific     │    1.71 │ awful lot              │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  5 │  -2.51 │ stunning     │    1.75 │ cunning                │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  6 │  -2

In [19]:
df

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183
2,mnb,V2,0.606978
3,svm,V2,0.624503
4,mnb,V3,0.606658
5,svm,V3,0.623815
6,mnb,V4,0.597382
7,svm,V4,0.630094
8,mnb,V5,0.598151
9,svm,V5,0.630334


## TEST 9 -- MNB & SVM with Vectorizer 9

In [20]:
vec = bigram_tv_v2
classifier = mnb

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V9', 'score': score})

classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V9', 'score': score})

╒════╤════════╤═══════════════════╤═════════╤══════════╕
│    │   Most │ Likely            │   Least │ Likely   │
╞════╪════════╪═══════════════════╪═════════╪══════════╡
│  0 │ -10.72 │ aaliyah           │   -7.65 │ stupid   │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  1 │ -10.72 │ abagnale          │   -7.62 │ mess     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  2 │ -10.72 │ abagnale antics   │   -7.55 │ minutes  │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  3 │ -10.72 │ abandon political │   -7.49 │ dull     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  4 │ -10.72 │ abandoned         │   -7.38 │ just     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  5 │ -10.72 │ abbreviated       │   -7.19 │ worst    │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  6 │ -10.72 │ abel              │   -7.09 │ like     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  7 │ -10.72 │ abel ferrara   

╒════╤════════╤═════════════════════════╤═════════╤══════════════════╕
│    │   Most │ Likely                  │   Least │ Likely           │
╞════╪════════╪═════════════════════════╪═════════╪══════════════════╡
│  0 │  -2.21 │ wo feel                 │    2.14 │ delivered mr     │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  1 │  -2.01 │ unlikable uninteresting │    2.17 │ sadly            │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  2 │  -1.76 │ way does                │    2.19 │ want think       │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  3 │  -1.76 │ contrived overblown     │    2.20 │ overbearing      │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  4 │  -1.64 │ justice awfulness       │    2.21 │ padded           │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  5 │  -1.64 │ willing claustrophobic  │    2.25 │ muddy            │
├────┼

In [21]:
df

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.606401
1,svm,V1,0.624183
2,mnb,V2,0.606978
3,svm,V2,0.624503
4,mnb,V3,0.606658
5,svm,V3,0.623815
6,mnb,V4,0.597382
7,svm,V4,0.630094
8,mnb,V5,0.598151
9,svm,V5,0.630334


In [22]:
pred_vec = bigram_cv_v2

test = pd.read_csv("kaggle-sentiment/test.tsv", delimiter='\t')
k_id = test['PhraseId'].values
k_text = test['Phrase'].values

k_vec = bigram_cv_v2.transform(k_text)
k_vec

def get_kaggle_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=None, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def do_the_kaggle(X,y,vec):
    X_train_vec, X_test_vec, y_train, y_test = get_kaggle_test_train_vec(X,y,vec)
    svm_clf = LinearSVC(C=1)
    prediction = svm_clf.fit(X_train_vec,y_train).predict(k_vec)
    kaggle_submission = zip(k_id, prediction)
    outf=open('kaggle_submission_linearSVC_v5.csv', 'w')
    outf.write('PhraseId,Sentiment\n')
    for x, value in enumerate(kaggle_submission): outf.write(str(value[0]) + ',' + str(value[1]) + '\n')
    outf.close()
    print('prediction complete')

do_the_kaggle(X,y,bigram_cv_v2)



ValueError: X has 34279 features per sample; expecting 43432

In [None]:
df