In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler

In [286]:
# set random seed
np.random.seed=42

In [3]:
# read in data from NLP
df = pd.read_csv('./data/python1.csv')

In [5]:
# reminder of what the data looks like
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   python       2000 non-null   int64  
 1   selftext     1226 non-null   object 
 2   title        2000 non-null   object 
 3   self_pol     2000 non-null   float64
 4   self_sub     2000 non-null   float64
 5   title_pol    2000 non-null   float64
 6   title_sub    2000 non-null   float64
 7   title_words  2000 non-null   float64
 8   self_words   2000 non-null   float64
 9   words        2000 non-null   float64
 10  sentences    2000 non-null   float64
 11  text_pol     2000 non-null   float64
 12  text_sub     2000 non-null   float64
 13  text         2000 non-null   object 
dtypes: float64(10), int64(1), object(3)
memory usage: 218.9+ KB


# Logistic Regression(CVEC)

In [5]:
# create X, y for modeling
X = df['text']
y = df['python']

In [253]:
# TTS with random state so all models use the same splits
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [263]:
# create pipeline with vectorizer to tokenize and vectorize, will model with LogisticRegression
lr_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(n_jobs=-1))
     ])

In [264]:
# turns out the default params work best
params = {
#     'cvec__stop_words': [None, 'english'],
#     'cvec__max_features': [2_000, 3_000, 4_000, 5_000],
#     'cvec__min_df': [2, 3],
#     'cvec__max_df': [.9, .95],
#     'cvec__ngram_range': [(1,1), (1,2)]
    'lr__C':np.linspace(0.1, 1, 20),
    'lr__solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

In [265]:
# run gridsearch to tune hyperparameters
gs = GridSearchCV(lr_cvec_pipe, param_grid=params, cv=5)

In [266]:
# fit training data on best model
gs.fit(X_train, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('lr', LogisticRegression(n_jobs=-1))]),
             param_grid={'lr__C': array([0.1       , 0.14736842, 0.19473684, 0.24210526, 0.28947368,
       0.33684211, 0.38421053, 0.43157895, 0.47894737, 0.52631579,
       0.57368421, 0.62105263, 0.66842105, 0.71578947, 0.76315789,
       0.81052632, 0.85789474, 0.90526316, 0.95263158, 1.        ]),
                         'lr__solver': ['newton-cg', 'lbfgs', 'liblinear',
                                        'sag', 'saga']})

In [267]:
gs.best_estimator_

Pipeline(steps=[('cvec', CountVectorizer()),
                ('lr',
                 LogisticRegression(C=0.7631578947368421, n_jobs=-1,
                                    solver='sag'))])

In [268]:
# get best score for
gs.score(X_train, y_train)

0.936

In [269]:
gs.score(X_test, y_test)

0.864

In [270]:
cross_val_score(lr_cvec_pipe, X_train, y_train, cv=5).mean()

0.8733333333333333

In [271]:
cross_val_score(lr_cvec_pipe, X_test, y_test, cv=5).mean()

0.842

>## Model appears to be a little overfit, could probably bring the variance down but I'm going to use voting classifier so I'm short on time and hopefully the variance is worked out by voting

# Logistic Regression(TFID)

In [272]:
X = df['text']
y = df['python']

In [273]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [274]:
# same as lr_cvec_pipe, except we are using TfidVectorizer instead of CountVectorizer.
lr_tfid_pipe = Pipeline([
    ('tfid', TfidfVectorizer(max_features=5000, min_df=2, max_df=.7, ngram_range=(1,2))),
    ('lr', LogisticRegression(solver='newton-cg',n_jobs=-1))
     ])

In [275]:
## Do Not run, already tuned hyperparameters
params = {
#     'tfid__stop_words': [None, 'english'],
#     'tfid__max_features': [5000, 5500],
#     'tfid__min_df': [2],
#     'tfid__max_df': [.7, .65, .6,],
#     'tfid__ngram_range': [(1,3), (1,2)]
#     'lr__C':np.linspace(0.1, 1, 20),
#     'lr__solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# }

In [276]:
gs = GridSearchCV(lr_tfid_pipe, param_grid=params, cv=5)

In [277]:
# fit training data
gs.fit(X_train, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfid',
                                        TfidfVectorizer(max_df=0.7,
                                                        max_features=5000,
                                                        min_df=2,
                                                        ngram_range=(1, 2))),
                                       ('lr', LogisticRegression(n_jobs=-1))]),
             param_grid={'lr__C': array([0.1       , 0.14736842, 0.19473684, 0.24210526, 0.28947368,
       0.33684211, 0.38421053, 0.43157895, 0.47894737, 0.52631579,
       0.57368421, 0.62105263, 0.66842105, 0.71578947, 0.76315789,
       0.81052632, 0.85789474, 0.90526316, 0.95263158, 1.        ]),
                         'lr__solver': ['newton-cg', 'lbfgs', 'liblinear',
                                        'sag', 'saga']})

In [279]:
gs.best_estimator_

Pipeline(steps=[('tfid',
                 TfidfVectorizer(max_df=0.7, max_features=5000, min_df=2,
                                 ngram_range=(1, 2))),
                ('lr',
                 LogisticRegression(C=0.9052631578947369, n_jobs=-1,
                                    solver='newton-cg'))])

In [280]:
gs.score(X_train, y_train)

0.9773333333333334

In [281]:
gs.score(X_test, y_test)

0.886

In [282]:
cross_val_score(lr_tfid_pipe, X_train, y_train, cv=5).mean()

0.8960000000000001

In [283]:
cross_val_score(lr_tfid_pipe, X_test, y_test, cv=5).mean()

0.85

>## Just like the other Logistic Regression model, this one appears to be a little overfit, could probably bring the variance down but I'm going to use voting classifier so I'm short on time and hopefully the variance is worked out by voting

# Random Forest Classifier(CVEC)

In [23]:
X = df['text']
y = df['python']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [25]:
# same as lr_cvec_pipe, except we are using a RandomForest Classifier and CountVectorizer.
rf_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer(max_features=4000, min_df=3, max_df=.9)),
    ('rf', RandomForestClassifier(n_estimators=128, max_depth=9,n_jobs=-1))
])

In [26]:
## Do NOT run, over 5000 models so I ran it once using my own linux server. Got best params from there.
# params = {
#     'rf__n_estimators':[126,127,128],
#     'rf__max_features':[None, 'auto'],
#     'rf__max_depth':[7,8,9],
#     'cvec__stop_words': [None, 'english'],
#     'cvec__max_features': [2_000, 3_000, 4_000, 5_000],
#     'cvec__min_df': [2, 3],
#     'cvec__max_df': [.9, .95],
#     'cvec__ngram_range': [(1,1), (1,2)]
# }

In [27]:
rf_cvec_pipe.fit(X_train, y_train)

Pipeline(steps=[('cvec',
                 CountVectorizer(max_df=0.9, max_features=4000, min_df=3)),
                ('rf',
                 RandomForestClassifier(max_depth=9, n_estimators=128,
                                        n_jobs=-1))])

In [29]:
cross_val_score(rf_cvec_pipe, X_train, y_train, cv=5).mean()

0.8466666666666667

In [31]:
cross_val_score(rf_cvec_pipe, X_test, y_test, cv=5).mean()

0.844

>## Unlike the LogisticRegression models, this one is not overfit, the variance is minimal. 

# Random Forest Classifier(TFID)

In [32]:
X = df['text']
y = df['python']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [34]:
# same as lr_cvec_pipe, except we are using a RandomForest Classifier and TfidVectorizer.
rf_tfid_pipe = Pipeline([
    ('tfid', TfidfVectorizer(max_features=4000, min_df=3, max_df=.9)),
    ('rf', RandomForestClassifier(n_estimators=128, max_depth=9,n_jobs=-1))
])

In [35]:
## Do NOT run, over 5000 models so I ran it once using my own linux server. Got best params from there.
# params = {
#     'rf__n_estimators':[126,127,128],
#     'rf__max_features':[None, 'auto'],
#     'rf__max_depth':[7,8,9],
#     'cvec__stop_words': [None, 'english'],
#     'cvec__max_features': [2_000, 3_000, 4_000, 5_000],
#     'cvec__min_df': [2, 3],
#     'cvec__max_df': [.9, .95],
#     'cvec__ngram_range': [(1,1), (1,2)]
# }

In [36]:
rf_tfid_pipe.fit(X_train, y_train)

Pipeline(steps=[('tfid',
                 TfidfVectorizer(max_df=0.9, max_features=4000, min_df=3)),
                ('rf',
                 RandomForestClassifier(max_depth=9, n_estimators=128,
                                        n_jobs=-1))])

In [37]:
rf_tfid_pipe.score(X_train, y_train)

0.9373333333333334

In [38]:
cross_val_score(rf_tfid_pipe, X_train, y_train, cv=5).mean()

0.8626666666666667

In [39]:
rf_tfid_pipe.score(X_test, y_test)

0.832

In [40]:
cross_val_score(rf_tfid_pipe, X_test, y_test, cv=5).mean()

0.8160000000000001

>## Like the other RandomForest model, this one is not overfit, the variance is minimal. 

# MN Naive Bayes(CVEC)

In [47]:
X = df['text']
y = df['python']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [49]:
# same as lr_cvec_pipe, except we are using a Naive Bayes Classifier and CountVectorizer.
nb_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer(max_features=4000, min_df=3, max_df=.9)),
    ('rf', MultinomialNB())
])

In [50]:
nb_cvec_pipe.fit(X_train, y_train)

Pipeline(steps=[('cvec',
                 CountVectorizer(max_df=0.9, max_features=4000, min_df=3)),
                ('rf', MultinomialNB())])

In [51]:
nb_cvec_pipe.score(X_train, y_train)

0.932

In [53]:
nb_cvec_pipe.score(X_test, y_test)

0.87

In [52]:
cross_val_score(nb_cvec_pipe, X_train, y_train, cv=5).mean()

0.8733333333333333

In [54]:
cross_val_score(nb_cvec_pipe, X_test, y_test, cv=5).mean()

0.8019999999999999

>## Just like the Logistic Regression models, this one appears to be a little overfit, could probably bring the variance down but I'm going to use voting classifier so I'm short on time and hopefully the variance is worked out by voting

# MN Naive Bayes(TFID)

In [55]:
X = df['text']
y = df['python']

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [57]:
# same as lr_cvec_pipe, except we are using a Naive Bayes Classifier and TfidVectorizer.
nb_tfid_pipe = Pipeline([
    ('tfid', TfidfVectorizer(max_features=4000, min_df=3, max_df=.9)),
    ('rf', MultinomialNB())
])

In [58]:
nb_tfid_pipe.fit(X_train, y_train)

Pipeline(steps=[('tfid',
                 TfidfVectorizer(max_df=0.9, max_features=4000, min_df=3)),
                ('rf', MultinomialNB())])

In [59]:
nb_tfid_pipe.score(X_train, y_train)

0.9473333333333334

In [60]:
nb_tfid_pipe.score(X_test, y_test)

0.85

In [61]:
cross_val_score(nb_tfid_pipe, X_train, y_train, cv=5).mean()

0.8626666666666667

In [62]:
cross_val_score(nb_tfid_pipe, X_test, y_test, cv=5).mean()

0.842

>## Unlike the other Naive Bayes model, this one appears to not be overfit

# Voting Classifier

>### Instantiating a VotingClassifier which uses all of our previous models and has them vote on the correct classification. There are two options, a 'hard' vote and a 'soft' vote. I borrowed some code from sklearn's documentation on VotingClassifiers. https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier

In [63]:
# voting = 'hard'
vote_hard = VotingClassifier(estimators=[('lr_cvec', lr_cvec_pipe), ('lr_tfid', lr_tfid_pipe), ('rf_cvec', rf_cvec_pipe), ('rf_tfid', rf_tfid_pipe), 
                                    ('nb_cvec', nb_cvec_pipe), ('nb_tfid', nb_tfid_pipe)], n_jobs=-1, voting='hard')

In [64]:
# voting = 'hard'
# the models are zipped into tuples of (model, 'name')
# iterate through tuples and for each model, we are printing the cross_val_score on training data
# for loop code from https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier
for clf, label in zip([lr_cvec_pipe, lr_tfid_pipe, rf_cvec_pipe, rf_tfid_pipe, nb_cvec_pipe, nb_tfid_pipe, vote_hard], 
                      ['Logistic Regression(CVEC)','Logistic Regression(TFID)', 'Random Forest(CVEC)',
                       'Random Forest(TFID)', 'Naive Bayes(CVEC)', 'Naive Bayes(TFID)', 'Ensemble']):
    train_scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
    print("Training Accuracy: %0.2f (+/- %0.2f) [%s]" % (train_scores.mean(), train_scores.std(), label))
    
print('') # print line for space
    
for clf, label in zip([lr_cvec_pipe, lr_tfid_pipe, rf_cvec_pipe, rf_tfid_pipe, nb_cvec_pipe, nb_tfid_pipe, vote_hard], 
                      ['Logistic Regression(CVEC)','Logistic Regression(TFID)', 'Random Forest(CVEC)',
                       'Random Forest(TFID)', 'Naive Bayes(CVEC)', 'Naive Bayes(TFID)', 'Ensemble']):
    test_scores = cross_val_score(clf, X_test, y_test, scoring='accuracy', cv=5)
    print("Testing Accuracy: %0.2f (+/- %0.2f) [%s]" % (test_scores.mean(), test_scores.std(), label))

Training Accuracy: 0.87 (+/- 0.02) [Logistic Regression(CVEC)]
Training Accuracy: 0.90 (+/- 0.03) [Logistic Regression(TFID)]
Training Accuracy: 0.86 (+/- 0.02) [Random Forest(CVEC)]
Training Accuracy: 0.85 (+/- 0.03) [Random Forest(TFID)]
Training Accuracy: 0.87 (+/- 0.02) [Naive Bayes(CVEC)]
Training Accuracy: 0.86 (+/- 0.02) [Naive Bayes(TFID)]
Training Accuracy: 0.90 (+/- 0.03) [Ensemble]

Testing Accuracy: 0.84 (+/- 0.04) [Logistic Regression(CVEC)]
Testing Accuracy: 0.85 (+/- 0.02) [Logistic Regression(TFID)]
Testing Accuracy: 0.84 (+/- 0.03) [Random Forest(CVEC)]
Testing Accuracy: 0.82 (+/- 0.03) [Random Forest(TFID)]
Testing Accuracy: 0.80 (+/- 0.03) [Naive Bayes(CVEC)]
Testing Accuracy: 0.84 (+/- 0.02) [Naive Bayes(TFID)]
Testing Accuracy: 0.85 (+/- 0.02) [Ensemble]


>## As you can see, the Ensemble model with 'hard' voting was no better than the Logistic Regression(TFID) model.

In [67]:
vote_soft = VotingClassifier(estimators=[('lr_cvec', lr_cvec_pipe), ('lr_tfid', lr_tfid_pipe), ('rf_cvec', rf_cvec_pipe), ('rf_tfid', rf_tfid_pipe), 
                                    ('nb_cvec', nb_cvec_pipe), ('nb_tfid', nb_tfid_pipe)], n_jobs=-1, voting='soft')

In [68]:
# voting = 'soft'
# the models are zipped into tuples of (model, 'name')
# iterate through tuples and for each model, we are printing the cross_val_score on training data
# for loop code from https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier
for clf, label in zip([lr_cvec_pipe, lr_tfid_pipe, rf_cvec_pipe, rf_tfid_pipe, nb_cvec_pipe, nb_tfid_pipe, vote_soft], 
                      ['Logistic Regression(CVEC)','Logistic Regression(TFID)', 'Random Forest(CVEC)',
                       'Random Forest(TFID)', 'Naive Bayes(CVEC)', 'Naive Bayes(TFID)', 'Ensemble']):
    train_scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
    print("Training Accuracy: %0.2f (+/- %0.2f) [%s]" % (train_scores.mean(), train_scores.std(), label))
    
print('') # print line for space

vote_soft = VotingClassifier(estimators=[('lr_cvec', lr_cvec_pipe), ('lr_tfid', lr_tfid_pipe), ('rf_cvec', rf_cvec_pipe), ('rf_tfid', rf_tfid_pipe), 
                                    ('nb_cvec', nb_cvec_pipe), ('nb_tfid', nb_tfid_pipe)], n_jobs=-1, voting='soft')
# voting = 'soft'
for clf, label in zip([lr_cvec_pipe, lr_tfid_pipe, rf_cvec_pipe, rf_tfid_pipe, nb_cvec_pipe, nb_tfid_pipe, vote_soft], 
                      ['Logistic Regression(CVEC)','Logistic Regression(TFID)', 'Random Forest(CVEC)',
                       'Random Forest(TFID)', 'Naive Bayes(CVEC)', 'Naive Bayes(TFID)', 'Ensemble']):
    test_scores = cross_val_score(clf, X_test, y_test, scoring='accuracy', cv=5)
    print("Testing Accuracy: %0.2f (+/- %0.2f) [%s]" % (test_scores.mean(), test_scores.std(), label))

Training Accuracy: 0.87 (+/- 0.02) [Logistic Regression(CVEC)]
Training Accuracy: 0.90 (+/- 0.03) [Logistic Regression(TFID)]
Training Accuracy: 0.86 (+/- 0.03) [Random Forest(CVEC)]
Training Accuracy: 0.86 (+/- 0.03) [Random Forest(TFID)]
Training Accuracy: 0.87 (+/- 0.02) [Naive Bayes(CVEC)]
Training Accuracy: 0.86 (+/- 0.02) [Naive Bayes(TFID)]
Training Accuracy: 0.90 (+/- 0.03) [Ensemble]

Testing Accuracy: 0.84 (+/- 0.04) [Logistic Regression(CVEC)]
Testing Accuracy: 0.85 (+/- 0.02) [Logistic Regression(TFID)]
Testing Accuracy: 0.83 (+/- 0.02) [Random Forest(CVEC)]
Testing Accuracy: 0.82 (+/- 0.04) [Random Forest(TFID)]
Testing Accuracy: 0.80 (+/- 0.03) [Naive Bayes(CVEC)]
Testing Accuracy: 0.84 (+/- 0.02) [Naive Bayes(TFID)]
Testing Accuracy: 0.86 (+/- 0.03) [Ensemble]


>## As you can see, the Ensemble model with 'soft' voting was marginally better on testing data than the other models, though still falling within std.

# Get Key terms for each language:

In [303]:
## from one of the pipelines, take the feature_importances and feature_names so we can pair them to find key words
imp = rf_cvec_pipe['rf'].feature_importances_
features = rf_cvec_pipe['cvec'].get_feature_names()

In [304]:
# zip them together into a dictionary so we can sort, then iterate through
data = dict(zip(features, imp))

In [305]:
# sort the data, choose top 100, store as key_terms
key_terms = sorted(data, key=data.get, reverse=True)[:50]

In [306]:
# initiate Counter method on key_terms to make a dictionary of key_terms
c = Counter(key_terms)
# combine, then tokenize the entire text of the python class
python_words = df.loc[df.python == 1].text.str.cat(sep=(','))
python_words = python_words.split(' ')
python_words = ' '.join(python_words).split()
# iterate through words in python class, match with key_words and add to the counter
for i in python_words:
    if i in c.keys():
        c[i] += 1
    python_terms = c

In [307]:
# create dataframe from dictionary and save for use in visualizations
python_terms_df = pd.DataFrame(python_terms.items(), columns=['Term','Count'])

# python_terms_df.to_csv('./data/python_terms.csv', index=False)

In [308]:
# initiate Counter method on key_terms to make a dictionary of key_terms
c = Counter(key_terms)
# combine, then tokenize the entire text of the go class
go_words = df.loc[df.python == 0].text.str.cat(sep=(','))
go_words = go_words.split(' ')
go_words = ' '.join(go_words).split()
# iterate through words in go class, match with key_words and add to the counter
for i in go_words:
    if i in c.keys():
        c[i] += 1
    go_terms = c

In [309]:
# create dataframe from dictionary and save for use in visualizations
go_terms_df = pd.DataFrame(go_terms.items(), columns=['Term','Count'])

# go_terms_df.to_csv('./data/go_terms.csv', index=False)

In [311]:
# combine the dataframes into one to use for visualizations

python_terms_df['python'] = 1
go_terms_df['python'] = 0

terms_df = python_terms_df.append(go_terms_df)

# terms_df.to_csv('./data/terms.csv', index=False)

In [334]:
df.groupby('python').mean()

Unnamed: 0_level_0,self_pol,self_sub,title_pol,title_sub,title_words,self_words,words,sentences,text_pol,text_sub
python,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.055741,0.208771,0.067945,0.187181,8.664,52.511,61.175,2.63,0.09628,0.314685
1,0.045397,0.186743,0.041154,0.205864,10.105,52.807,62.912,2.667,0.076247,0.321351
