In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk import sent_tokenize
from prepare import prepare

import sklearn.preprocessing
import warnings
import re

from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
# imports for modeling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [2]:
with open('data.json') as json_file:
    data = json.load(json_file)

In [3]:
df = pd.DataFrame(data)

# Check out prepare for prepare details

In [4]:
train,validate,test = prepare(df)

In [5]:
train.head()

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
63,Asabeneh/30-Days-Of-JavaScript,JavaScript,# 30 Days Of JavaScript\n\n| # Day | ...,30 days javascript day topics 01 introductionr...,30 day javascript day topic 01 introductionrea...,30 day javascript day topic 01 introductionrea...
18,MrS0m30n3/youtube-dl-gui,Python,[![Donations Badge](https://yourdonation.rocks...,donations badgehttpsyourdonationrocksimagesbad...,donat badgehttpsyourdonationrocksimagesbadgesv...,donation badgehttpsyourdonationrocksimagesbadg...
72,statianzo/Fleck,C#,Fleck\r\n===\r\n\r\n[![Build status](https://c...,fleck build statushttpsciappveyorcomapiproject...,fleck build statushttpsciappveyorcomapiproject...,fleck build statushttpsciappveyorcomapiproject...
12,ddbourgin/numpy-ml,Python,# numpy-ml\nEver wish you had an inefficient b...,numpyml ever wish inefficient somewhat legible...,numpyml ever wish ineffici somewhat legibl col...,numpyml ever wish inefficient somewhat legible...
88,mono/CppSharp,C#,CppSharp is a tool and set of libraries which ...,cppsharp tool set libraries facilitates usage ...,cppsharp tool set librari facilit usag nativ c...,cppsharp tool set library facilitates usage na...


In [6]:
train.shape, validate.shape, test.shape

((58, 6), (25, 6), (21, 6))

# No duplicates

# Exploration

In [7]:
def clean(text):
    'A simple function to cleanup text data'
    
    ADDITIONAL_STOPWORDS = []
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [8]:
def show_counts_and_ratios(df, column):
    """
    Takes in a dataframe and a string of a single column
    Returns a dataframe with absolute value counts and percentage value counts
    """
    labels = pd.concat([df[column].value_counts(),
                    df[column].value_counts(normalize=True)], axis=1)
    labels.columns = ['n', 'percent']
    labels
    return labels

show_counts_and_ratios(train, "language")

Unnamed: 0,n,percent
C#,18,0.310345
HTML,14,0.241379
JavaScript,13,0.224138
Python,13,0.224138


In [9]:
train.head()

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
63,Asabeneh/30-Days-Of-JavaScript,JavaScript,# 30 Days Of JavaScript\n\n| # Day | ...,30 days javascript day topics 01 introductionr...,30 day javascript day topic 01 introductionrea...,30 day javascript day topic 01 introductionrea...
18,MrS0m30n3/youtube-dl-gui,Python,[![Donations Badge](https://yourdonation.rocks...,donations badgehttpsyourdonationrocksimagesbad...,donat badgehttpsyourdonationrocksimagesbadgesv...,donation badgehttpsyourdonationrocksimagesbadg...
72,statianzo/Fleck,C#,Fleck\r\n===\r\n\r\n[![Build status](https://c...,fleck build statushttpsciappveyorcomapiproject...,fleck build statushttpsciappveyorcomapiproject...,fleck build statushttpsciappveyorcomapiproject...
12,ddbourgin/numpy-ml,Python,# numpy-ml\nEver wish you had an inefficient b...,numpyml ever wish inefficient somewhat legible...,numpyml ever wish ineffici somewhat legibl col...,numpyml ever wish inefficient somewhat legible...
88,mono/CppSharp,C#,CppSharp is a tool and set of libraries which ...,cppsharp tool set libraries facilitates usage ...,cppsharp tool set librari facilit usag nativ c...,cppsharp tool set library facilitates usage na...


# Modeling

In [10]:
#Create a baseline model

print(f'Baseline Accuracy:{train.language.value_counts().idxmax()} {round(max(train.language.value_counts()) / train.shape[0] *100)}%' )

Baseline Accuracy:C# 31%


In [11]:
def vectorizer_split(x):   
    vectorizer = CountVectorizer(binary = True, stop_words = 'english')
    vectorizer.fit(list(train[x]))
    X_train = vectorizer.transform(train[x])
    X_validate= vectorizer.transform(validate[x])
    X_test = vectorizer.transform(test[x])
    return X_train.todense(),X_validate.todense(),X_test.todense()

def tfidf_split(x):   
    tfidf = TfidfVectorizer()
    tfidf.fit(list(train[x]))
    X_train = tfidf.transform(train[x])
    X_validate= tfidf.transform(validate[x])
    X_test = tfidf.transform(test[x])
    return X_train.todense(),X_validate.todense(),X_test.todense()

def test_a_model(X_train, y_train, X_validate, y_validate, model, model_name, score_df):
    '''
    Function takes in X and y train
    X and y validate (or test) 
    A model with it's hyper parameters
    And a df to store the scores 
    - Set up an empty dataframe with score_df first
    - score_df = pd.DataFrame(columns = ['model_name', 'train_score', 'validate_score'])
    '''
    this_model = model

    this_model.fit(X_train, y_train)

    # Check with Validate

    train_score = this_model.score(X_train, y_train)
    
    validate_score = this_model.score(X_validate, y_validate)
    
    model_dict = {'model_name': model_name, 
                  'train_score': train_score, 
                  'validate_score':validate_score}
    score_df = score_df.append(model_dict, ignore_index = True)
    
    return score_df




In [12]:
y_train = train.language
y_validate = validate.language
y_test = test.language
X_train,X_validate,X_test = vectorizer_split('clean')

In [13]:
X_train,X_validate,X_test = vectorizer_split('stemmed')

In [14]:
score_df = pd.DataFrame(columns = ['model_name', 'train_score', 'validate_score'])

In [15]:
## Create a for loop that creates 20 Random Forrest models with increasingly larger depths.
metrics2 = []
forest_models = []
for i in range(2, 22):
    # Make the model
    forest = RandomForestClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)
    
    y_predictions = forest.predict(X_train)
    y_pred = forest.predict(X_validate)
    
    # Use the model
    in_sample_accuracy = round(forest.score(X_train, y_train),3)
    
    out_of_sample_accuracy = round(forest.score(X_validate, y_validate),3)
    
    in_sample_recall = round(sklearn.metrics.recall_score(y_train, y_predictions, pos_label =0, average='micro'),3)
    
    out_of_sample_recall = round(sklearn.metrics.recall_score(y_validate, y_pred, pos_label =0, average='micro'),3)
    
    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy,
        "train_recall": in_sample_recall,
        "validate_recall": out_of_sample_recall
    }
    
    # This creates the df below
    metrics2.append(output)
    # tree_models will store all of my tree models incase i want them later
    forest_models.append(forest)
    
    
    
forest_df = pd.DataFrame(metrics2)
forest_df["accuracy_difference"] = forest_df.train_accuracy - forest_df.validate_accuracy
forest_df.style.highlight_min('accuracy_difference')

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,train_recall,validate_recall,accuracy_difference
0,2,0.741,0.28,0.741,0.28,0.461
1,3,0.862,0.48,0.862,0.48,0.382
2,4,0.897,0.48,0.897,0.48,0.417
3,5,0.931,0.52,0.931,0.52,0.411
4,6,0.966,0.52,0.966,0.52,0.446
5,7,0.966,0.52,0.966,0.52,0.446
6,8,0.983,0.6,0.983,0.6,0.383
7,9,1.0,0.52,1.0,0.52,0.48
8,10,1.0,0.56,1.0,0.56,0.44
9,11,1.0,0.56,1.0,0.56,0.44


In [16]:
from sklearn import svm
# make a list of algorithms we want to try for our models
model_list = [MultinomialNB(), LinearSVC(), DecisionTreeClassifier(), forest_models[forest_df.accuracy_difference.idxmin()], KNeighborsClassifier(), LogisticRegression(), svm.SVC(C= 2, decision_function_shape='ovo')]

# name the models
model_names = ['Naive_Bayes_stemmed_CV', 'SVC_stemmed_CV', 'Decision_tree_stemmed_CV', 'Random_forest_stemmed_CV', 'KNN_bigrams_stemmed_CV', 'Log_reg_stemmed_CV','SVC_stemmed_CV']
# Run the models
for model, name in zip(model_list, model_names):
    score_df = test_a_model(X_train, y_train, X_validate, y_validate, model, name, score_df)

In [17]:
X_train,X_validate,X_test = vectorizer_split('lemmatized')

In [18]:
from sklearn import svm
# make a list of algorithms we want to try for our models
model_list = [MultinomialNB(), LinearSVC(), DecisionTreeClassifier(), forest_models[forest_df.accuracy_difference.idxmin()], KNeighborsClassifier(), LogisticRegression(), svm.SVC(C= 2, decision_function_shape='ovo')]

# name the models
model_names = ['Naive_Bayes_lemmatized_CV', 'SVC_lemmatized_CV', 'Decision_tree_lemmatized_CV', 'Random_forest_lemmatized_CV', 'KNN_bigrams_lemmatized_CV', 'Log_reg_lemmatized_CV','SVC_lemmatized_CV']
# Run the models
for model, name in zip(model_list, model_names):
    score_df = test_a_model(X_train, y_train, X_validate, y_validate, model, name, score_df)

In [19]:
X_train,X_validate,X_test = tfidf_split('stemmed')

In [20]:
from sklearn import svm
# make a list of algorithms we want to try for our models
model_list = [MultinomialNB(), LinearSVC(), DecisionTreeClassifier(), forest_models[forest_df.accuracy_difference.idxmin()], KNeighborsClassifier(), LogisticRegression(), svm.SVC(C= 2, decision_function_shape='ovo')]

# name the models
model_names = ['Naive_Bayes_stemmed_TFIDF', 'SVC_stemmed_TFIDF', 'Decision_tree_stemmed_TFIDF', 'Random_forest_stemmed_TFIDF', 'KNN_bigrams_stemmed_TFIDF', 'Log_reg_stemmed_TFIDF','SVC_stemmed_TFIDF']
# Run the models
for model, name in zip(model_list, model_names):
    score_df = test_a_model(X_train, y_train, X_validate, y_validate, model, name, score_df)

In [21]:
X_train,X_validate,X_test = tfidf_split('lemmatized')

In [22]:
from sklearn import svm
# make a list of algorithms we want to try for our models
model_list = [MultinomialNB(), LinearSVC(), DecisionTreeClassifier(), forest_models[forest_df.accuracy_difference.idxmin()], KNeighborsClassifier(), LogisticRegression(), svm.SVC(C= 2, decision_function_shape='ovo')]

# name the models
model_names = ['Naive_Bayes_lemmatized_TFIDF', 'SVC_lemmatized_TFIDF', 'Decision_tree_lemmatized_TFIDF', 'Random_forest_lemmatized_TFIDF', 'KNN_bigrams_lemmatized_TFIDF', 'Log_reg_lemmatized_TFIDF','SVC_lemmatized_TFIDF']
# Run the models
for model, name in zip(model_list, model_names):
    score_df = test_a_model(X_train, y_train, X_validate, y_validate, model, name, score_df)

In [23]:
score_df.style.highlight_max('validate_score')

Unnamed: 0,model_name,train_score,validate_score
0,Naive_Bayes_stemmed_CV,1.0,0.52
1,SVC_stemmed_CV,1.0,0.48
2,Decision_tree_stemmed_CV,1.0,0.52
3,Random_forest_stemmed_CV,0.862069,0.48
4,KNN_bigrams_stemmed_CV,0.396552,0.2
5,Log_reg_stemmed_CV,1.0,0.64
6,SVC_stemmed_CV,0.913793,0.6
7,Naive_Bayes_lemmatized_CV,1.0,0.52
8,SVC_lemmatized_CV,1.0,0.48
9,Decision_tree_lemmatized_CV,1.0,0.56


In [24]:
## Create a for loop that creates 20 Random Forrest models with increasingly larger depths.
metrics2 = []
forest_models = []
for i in range(2, 22):
    # Make the model
    forest = RandomForestClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)
    
    y_predictions = forest.predict(X_train)
    y_pred = forest.predict(X_validate)
    
    # Use the model
    in_sample_accuracy = round(forest.score(X_train, y_train),3)
    
    out_of_sample_accuracy = round(forest.score(X_validate, y_validate),3)
    
    in_sample_recall = round(sklearn.metrics.recall_score(y_train, y_predictions, pos_label =0, average='micro'),3)
    
    out_of_sample_recall = round(sklearn.metrics.recall_score(y_validate, y_pred, pos_label =0, average='micro'),3)
    
    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy,
        "train_recall": in_sample_recall,
        "validate_recall": out_of_sample_recall
    }
    
    # This creates the df below
    metrics2.append(output)
    # tree_models will store all of my tree models incase i want them later
    forest_models.append(forest)
    
    
    
forest_df = pd.DataFrame(metrics2)
forest_df["accuracy_difference"] = forest_df.train_accuracy - forest_df.validate_accuracy
forest_df.style.heatmap()

AttributeError: 'Styler' object has no attribute 'heatmap'

In [None]:
forest_models[forest_df.accuracy_difference.idxmin()]

In [None]:
forest_df.accuracy_difference.idxmin()