# Modeling

In [1]:
import numpy as np
import pandas as pd

import unicodedata

import re

import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

import acquire as ac
import prepare as pr
import preprocessing as pp
import evaluate as ev

---
## Wrangle

In [2]:
# Load the README dataset through the project's prepare module and display it.
df = pr.wrangle_readme_data()
df

Unnamed: 0,repo,language,readme_contents,clean_readme_contents,len_of_clean_readme_contents
0,rdpeng/ProgrammingAssignment2,R,### Introduction\n\nThis second programming as...,introduction second programming assignment req...,316
1,octocat/Spoon-Knife,HTML,### Well hello there!\n\nThis repository is me...,well hello repository meant provide example fo...,66
2,tensorflow/tensorflow,C++,"<div align=""center"">\n <img src=""https://www....",div align center tensorflow image tf logo soci...,1138
3,SmartThingsCommunity/SmartThingsPublic,Groovy,# SmartThings Public GitHub Repo\n\nAn officia...,smartthings public repo official list smartapp...,44
4,twbs/bootstrap,JavaScript,"<p align=""center"">\n <a href=""https://getboot...",align center href getbootstrap getbootstrap as...,1048
...,...,...,...,...,...
210,akveo/ngx-admin,TypeScript,"# ngx-admin [<img src=""https://i.imgur.com/oMc...",ngx admin imgur omcxwz png alt eva design syst...,434
211,swirldev/swirl_courses,R,# swirl courses\n\nThis is a collection of int...,swirl course collection interactive course use...,420
212,jrowberg/i2cdevlib,C++,Jennic platform added!\n\n====================...,jennic platform added note detail project plea...,243
213,etcd-io/etcd,Go,# etcd\n\n[![Go Report Card](https://goreportc...,etcd go report card goreportcard badge etcd io...,882


In [3]:
# List the columns available after wrangling.
df.columns

Index(['repo', 'language', 'readme_contents', 'clean_readme_contents',
       'len_of_clean_readme_contents'],
      dtype='object')

In [4]:
# Most frequent repo names.
# NOTE(review): a count of 2 means that repo appears on two rows of df.
df.repo.value_counts().head(18)

fengdu78/Coursera-ML-AndrewNg-Notes                                            2
soimort/you-get                                                                2
twbs/bootstrap                                                                 2
CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers    2
SmartThingsCommunity/SmartThingsPublic                                         2
nightscout/cgm-remote-monitor                                                  2
phonegap/phonegap-start                                                        2
arduino/Arduino                                                                2
octocat/Spoon-Knife                                                            2
tensorflow/tensorflow                                                          2
crossoverJie/JCSprout                                                          2
apache/incubator-mxnet                                                         2
axios/axios                 

In [5]:
# Least frequent repo names.
df.repo.value_counts().tail()

Snailclimb/JavaGuide                         1
ageron/handson-ml                            1
zxing/zxing                                  1
bcit-ci/CodeIgniter                          1
linuxacademy/devops-essentials-sample-app    1
Name: repo, dtype: int64

In [6]:
# Number of distinct repo names (missing values, if any, count as one value).
df.repo.nunique(dropna=False)

198

In [7]:
# Stratifying on language with train_test_split won't work unless every language
# has more than one observation, so check the per-language counts.
df.language.value_counts()

JavaScript          45
Java                35
Python              29
C++                 18
HTML                16
Jupyter Notebook    10
PHP                  9
Go                   9
TypeScript           7
Ruby                 7
CSS                  6
C                    5
Vue                  3
R                    3
Rust                 3
Kotlin               2
Shell                2
PowerShell           2
C#                   2
Groovy               2
Name: language, dtype: int64

---
## Model

---
### Outlier Detour

In [8]:
# lemmas
# list_of_readmes = df.clean_readme_contents.tolist()
# list_of_readmes

In [9]:
# for index in range(len(list_of_readmes)):
#     list_of_readmes[index] = list_of_readmes[index].split()
    

# print(list_of_readmes)

In [10]:
# list_of_list_of_lemmas = [lemma.split() for lemmas in list_of_lemmas]
# list_of_list_of_lemmas

In [11]:
# len(list_of_readmes[0])

In [12]:
# len(list_of_readmes[-1])

In [13]:
# len_of_readmes = [len(readme) for readme in list_of_readmes]
# len_of_readmes

In [14]:
# len_of_readmes.sort()

In [15]:
# print(len_of_readmes)

**Cutoff for README length (number of words in the cleaned README) = 10**

In [16]:
# Pull the cleaned README text stored under index label 3 as a plain list.
is_label_three = df.index == 3
index_three_list = df.loc[is_label_three, "clean_readme_contents"].tolist()
index_three_list

['smartthings public repo official list smartapps device type smartthings link help get started coding right away specific documentation smartthings en latest tool ide integration html full documentation smartthings ide simulator ide smartthings community forum community smartthings follow web twitter twitter smartthingsdev facebook facebook smartthingsdevelopers']

In [17]:
# Stored word count for the row(s) with index label 3.
df.loc[df.index == 3, "len_of_clean_readme_contents"]

3    44
Name: len_of_clean_readme_contents, dtype: int64

**len_of_clean_readme_contents seems to be working**

In [18]:
# How many READMEs have each length, ordered from shortest to longest.
df.len_of_clean_readme_contents.value_counts().sort_index()

12       2
13       1
19       1
22       1
44       2
        ..
7612     1
8042     1
10071    1
14505    1
22628    1
Name: len_of_clean_readme_contents, Length: 182, dtype: int64

In [19]:
# (rows, columns) of the frame.
df.shape

(215, 5)

**225 observations before outlier removal; should have 219 after** (note: `df.shape` above reports 215 rows, so these counts need re-checking)

**CHECK**

---
### Back on Model Track

#### Train/Test Split

In [20]:
# Reload the dataset so the modeling section starts from a fresh frame.
# NOTE(review): some repos appear twice in this data (215 rows, 198 unique repo names),
# so the same README can land in both train and test; consider de-duplicating.
df = pr.wrangle_readme_data()
print(df.shape)
df.head()

(215, 5)


Unnamed: 0,repo,language,readme_contents,clean_readme_contents,len_of_clean_readme_contents
0,rdpeng/ProgrammingAssignment2,R,### Introduction\n\nThis second programming as...,introduction second programming assignment req...,316
1,octocat/Spoon-Knife,HTML,### Well hello there!\n\nThis repository is me...,well hello repository meant provide example fo...,66
2,tensorflow/tensorflow,C++,"<div align=""center"">\n <img src=""https://www....",div align center tensorflow image tf logo soci...,1138
3,SmartThingsCommunity/SmartThingsPublic,Groovy,# SmartThings Public GitHub Repo\n\nAn officia...,smartthings public repo official list smartapp...,44
4,twbs/bootstrap,JavaScript,"<p align=""center"">\n <a href=""https://getboot...",align center href getbootstrap getbootstrap as...,1048


## Modeling

Transform your documents into a form that can be used in a machine learning model. You should use the programming language of the repository as the label to predict.

Try fitting several different models and using several different representations of the text (e.g. a simple bag of words, then also the TF-IDF values for each).

Build a function that takes in the text of a README file and tries to predict the programming language.

**CountVectorizer**

In [21]:
# create a CountVectorizer (raw term counts) with default settings
cv = CountVectorizer()

In [22]:
# learn the vocabulary from every cleaned README and build the document-term count matrix
cv_bag_of_words = cv.fit_transform(df.clean_readme_contents)

In [23]:
# sparse matrix summary: one row per README, one column per vocabulary term
cv_bag_of_words

<215x21344 sparse matrix of type '<class 'numpy.int64'>'
	with 71771 stored elements in Compressed Sparse Row format>

**TF-IDF**

In [24]:
# create a TF-IDF vectorizer with default settings
tfidf = TfidfVectorizer()

In [25]:
# fit on every cleaned README and build the TF-IDF weighted document-term matrix
tfidf_bag_of_words = tfidf.fit_transform(df.clean_readme_contents)

In [26]:
# sparse matrix summary for the TF-IDF features
tfidf_bag_of_words

<215x21344 sparse matrix of type '<class 'numpy.float64'>'
	with 71771 stored elements in Compressed Sparse Row format>

**CountVectorizer Bag of Bigrams**

In [27]:
# create a CountVectorizer restricted to bigrams (ngram_range=(2, 2))
cv_bigrams = CountVectorizer(ngram_range=(2, 2))

In [28]:
# fit on every cleaned README and build the bigram count matrix
cv_bag_of_bigrams = cv_bigrams.fit_transform(df.clean_readme_contents)

In [29]:
# sparse matrix summary for the bigram counts
cv_bag_of_bigrams

<215x134015 sparse matrix of type '<class 'numpy.int64'>'
	with 164928 stored elements in Compressed Sparse Row format>

**TF-IDF Bag of Bigrams**

In [30]:
# create a TF-IDF vectorizer restricted to bigrams (ngram_range=(2, 2))
tfidf_bigrams = TfidfVectorizer(ngram_range=(2, 2))

In [31]:
# fit on every cleaned README and build the TF-IDF weighted bigram matrix
tfidf_bag_of_bigrams = tfidf_bigrams.fit_transform(df.clean_readme_contents)

In [32]:
# sparse matrix summary for the TF-IDF bigram features
tfidf_bag_of_bigrams

<215x134015 sparse matrix of type '<class 'numpy.float64'>'
	with 164928 stored elements in Compressed Sparse Row format>

___

**Model #1: Standard CountVectorizer (CV)**

In [33]:
# establish features and target for CountVectorizer model
# NOTE(review): the vectorizer is fit on every README here, before the train/test
# split, so the vocabulary is learned from the test documents as well.
X = cv.fit_transform(df.clean_readme_contents)
y = df.language

In [34]:
# split features and target into train and test sets with the project's preprocessing helper
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

In [35]:
# Sanity-check the training split: shapes first, then each piece's share of all rows.
train_parts = (X_train, y_train)
for part in train_parts:
    print(part.shape)
for part in train_parts:
    print(part.shape[0] / df.shape[0])

(172, 21344)
(172,)
0.8
0.8


In [36]:
# Sanity-check the test split: shapes first, then each piece's share of all rows.
test_parts = (X_test, y_test)
for part in test_parts:
    print(part.shape)
for part in test_parts:
    print(part.shape[0] / df.shape[0])

(43, 21344)
(43,)
0.2
0.2


In [37]:
# Preview the first five training rows as a dense frame labelled by vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when the installed version provides it.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
# toarray() gives a plain ndarray rather than the np.matrix returned by todense()
pd.DataFrame(X_train[:5, :].toarray(), columns=feature_names)

Unnamed: 0,aa,aaaa,aaaaaaaaaac,aaaaaaaaaai,aaaaaaaad,aaaaaaaaecm,aaaaaaaaerc,aaaaaaaaex,aab,aac,...,zxingorg,zybpzd,zygmuntz,zyiot,zyiz,zynga,zypper,zyszys,zz,zzm
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# create an empty evaluation DataFrame; one row (model_type, accuracy) is appended per model below
evaluation = pd.DataFrame(columns=["model_type", "accuracy"])

In [39]:
# Model #1: depth-5 decision tree fit on the CountVectorizer features.
tree = DecisionTreeClassifier(max_depth=5, random_state=56).fit(X_train, y_train)

# Pair each actual language with the tree's prediction, one frame per split.
train = pd.DataFrame({"actual": y_train}).assign(predicted=tree.predict(X_train))
test = pd.DataFrame({"actual": y_test}).assign(predicted=tree.predict(X_test))

# Log this model and show the running leaderboard.
# NOTE(review): the helper is handed the training split, so the table ranks
# models by train accuracy rather than held-out accuracy.
evaluation = ev.append_evaluation(
    evaluation,
    model_type="CV Gini, 5",
    model_object=tree,
    X=X_train,
    y=y_train,
)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
0,"CV Gini, 5",0.575581


In [40]:
# Train-set accuracy followed by the per-language precision/recall/F1 table.
print('CV DT Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True, zero_division=0))

CV DT Model Train Accuracy: 57.56%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.666667,0.0,1.0,0.0,0.192308,1.0,1.0,1.0,...,0.8125,0.0,0.0,0.0,0.0,1.0,1.0,0.575581,0.410497,0.702081
recall,0.0,0.0,0.153846,0.0,0.714286,0.0,1.0,0.64,0.710526,0.7,...,0.684211,0.0,0.0,0.0,0.0,0.571429,1.0,0.575581,0.358715,0.575581
f1-score,0.0,0.0,0.25,0.0,0.833333,0.0,0.322581,0.780488,0.830769,0.823529,...,0.742857,0.0,0.0,0.0,0.0,0.727273,1.0,0.575581,0.350542,0.583395
support,3.0,2.0,13.0,5.0,7.0,2.0,15.0,25.0,38.0,10.0,...,19.0,2.0,6.0,2.0,2.0,7.0,3.0,0.575581,172.0,172.0


In [41]:
# Test-set accuracy followed by the per-language precision/recall/F1 table.
print('CV DT Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True, zero_division=0))

CV DT Model Test Accuracy: 37.21%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.041667,1.0,0.833333,0.5,0.75,0.0,0.0,0.0,0.372093,0.260417,0.56686
recall,0.0,0.0,0.0,0.0,1.0,0.3,0.714286,0.5,0.6,0.0,0.0,0.0,0.372093,0.259524,0.372093
f1-score,0.0,0.0,0.0,0.0,0.08,0.461538,0.769231,0.5,0.666667,0.0,0.0,0.0,0.372093,0.206453,0.412713
support,2.0,5.0,1.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,1.0,0.372093,43.0,43.0


**Model #2: TF-IDF**

In [42]:
# Model #2: depth-5 decision tree on TF-IDF unigram features.
# NOTE(review): the vectorizer is fit on every README before the split, so the
# vocabulary and IDF weights also see the test documents.
tfidf = TfidfVectorizer()
X, y = tfidf.fit_transform(df.clean_readme_contents), df.language

# train/test split via the project's helper, using its default arguments
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

tree = DecisionTreeClassifier(max_depth=5, random_state=56).fit(X_train, y_train)

# Pair each actual language with the tree's prediction, one frame per split.
train = pd.DataFrame({"actual": y_train}).assign(predicted=tree.predict(X_train))
test = pd.DataFrame({"actual": y_test}).assign(predicted=tree.predict(X_test))

# Log this model (scored on the training split) and show the running leaderboard.
evaluation = ev.append_evaluation(
    evaluation,
    model_type="TF-IDF Gini, 5",
    model_object=tree,
    X=X_train,
    y=y_train,
)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
1,"TF-IDF Gini, 5",0.598837
0,"CV Gini, 5",0.575581


In [43]:
# confirm the dimensions and dtype of the current feature matrix
X

<215x21344 sparse matrix of type '<class 'numpy.float64'>'
	with 71771 stored elements in Compressed Sparse Row format>

In [44]:
# Train-set accuracy followed by the per-language precision/recall/F1 table.
print('TF-IDF DT Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True, zero_division=0))

TF-IDF DT Model Train Accuracy: 59.88%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.666667,0.0,1.0,0.0,0.194805,1.0,1.0,1.0,...,1.0,0.0,0.0,0.666667,0.0,1.0,1.0,0.598837,0.455574,0.73259
recall,0.0,0.0,0.153846,0.0,0.714286,0.0,1.0,0.64,0.710526,0.9,...,0.684211,0.0,0.0,1.0,0.0,0.571429,1.0,0.598837,0.418715,0.598837
f1-score,0.0,0.0,0.25,0.0,0.833333,0.0,0.326087,0.780488,0.830769,0.947368,...,0.8125,0.0,0.0,0.8,0.0,0.727273,1.0,0.598837,0.402233,0.609396
support,3.0,2.0,13.0,5.0,7.0,2.0,15.0,25.0,38.0,10.0,...,19.0,2.0,6.0,2.0,2.0,7.0,3.0,0.598837,172.0,172.0


In [45]:
# Test-set accuracy followed by the per-language precision/recall/F1 table.
print('TF-IDF DT Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True, zero_division=0))

TF-IDF DT Model Test Accuracy: 34.88%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,Jupyter Notebook,PHP,Python,R,Ruby,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.041667,1.0,0.833333,0.0,0.5,0.833333,0.0,0.0,0.0,0.348837,0.246795,0.58624
recall,0.0,0.0,0.0,0.0,1.0,0.3,0.714286,0.0,0.5,0.5,0.0,0.0,0.0,0.348837,0.231868,0.348837
f1-score,0.0,0.0,0.0,0.0,0.08,0.461538,0.769231,0.0,0.5,0.625,0.0,0.0,0.0,0.348837,0.187367,0.403023
support,2.0,5.0,1.0,2.0,1.0,10.0,7.0,0.0,2.0,10.0,1.0,1.0,1.0,0.348837,43.0,43.0


**Model #3: CV Bigrams**

In [46]:
# Model #3: depth-5 decision tree on CountVectorizer bigram features.
# NOTE(review): the vectorizer is fit on every README before the split, so the
# vocabulary is learned from the test documents as well.
cv_bigrams = CountVectorizer(ngram_range=(2, 2))
X, y = cv_bigrams.fit_transform(df.clean_readme_contents), df.language

# train/test split via the project's helper, using its default arguments
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

tree = DecisionTreeClassifier(max_depth=5, random_state=56).fit(X_train, y_train)

# Pair each actual language with the tree's prediction, one frame per split.
train = pd.DataFrame({"actual": y_train}).assign(predicted=tree.predict(X_train))
test = pd.DataFrame({"actual": y_test}).assign(predicted=tree.predict(X_test))

# Log this model (scored on the training split) and show the running leaderboard.
evaluation = ev.append_evaluation(
    evaluation,
    model_type="CV Bigrams Gini, 5",
    model_object=tree,
    X=X_train,
    y=y_train,
)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
1,"TF-IDF Gini, 5",0.598837
0,"CV Gini, 5",0.575581
2,"CV Bigrams Gini, 5",0.418605


In [47]:
# Train-set accuracy followed by the per-language precision/recall/F1 table.
print('CV Bigrams Gini, 5 Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True, zero_division=0))

CV Bigrams Gini, 5 Model Train Accuracy: 41.86%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.2,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.418605,0.36,0.581395
recall,0.0,0.0,0.153846,0.2,0.0,0.0,0.0,1.0,0.631579,0.6,...,0.421053,0.0,0.0,0.0,0.0,0.428571,1.0,0.418605,0.221752,0.418605
f1-score,0.0,0.0,0.266667,0.333333,0.0,0.0,0.0,0.333333,0.774194,0.75,...,0.592593,0.0,0.0,0.0,0.0,0.6,1.0,0.418605,0.232506,0.400263
support,3.0,2.0,13.0,5.0,7.0,2.0,15.0,25.0,38.0,10.0,...,19.0,2.0,6.0,2.0,2.0,7.0,3.0,0.418605,172.0,172.0


In [48]:
# Test-set accuracy followed by the per-language precision/recall/F1 table.
print('CV Bigrams Gini, 5 Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True, zero_division=0))

CV Bigrams Gini, 5 Model Test Accuracy: 39.53%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,Jupyter Notebook,PHP,Python,R,Ruby,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.3125,0.833333,0.0,0.0,0.5,0.0,0.0,0.0,0.395349,0.126603,0.324612
recall,0.0,0.0,0.0,0.0,0.0,1.0,0.714286,0.0,0.0,0.2,0.0,0.0,0.0,0.395349,0.147253,0.395349
f1-score,0.0,0.0,0.0,0.0,0.0,0.47619,0.769231,0.0,0.0,0.285714,0.0,0.0,0.0,0.395349,0.11778,0.302411
support,2.0,5.0,1.0,2.0,1.0,10.0,7.0,0.0,2.0,10.0,1.0,1.0,1.0,0.395349,43.0,43.0


**Model #4: TF-IDF Bigrams**

In [49]:
# Model #4: depth-5 decision tree on TF-IDF bigram features.
# NOTE(review): the vectorizer is fit on every README before the split, so the
# vocabulary and IDF weights also see the test documents.
tfidf_bigrams = TfidfVectorizer(ngram_range=(2, 2))
X, y = tfidf_bigrams.fit_transform(df.clean_readme_contents), df.language

# train/test split via the project's helper, using its default arguments
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

tree = DecisionTreeClassifier(max_depth=5, random_state=56).fit(X_train, y_train)

# Pair each actual language with the tree's prediction, one frame per split.
train = pd.DataFrame({"actual": y_train}).assign(predicted=tree.predict(X_train))
test = pd.DataFrame({"actual": y_test}).assign(predicted=tree.predict(X_test))

# Log this model (scored on the training split) and show the running leaderboard.
evaluation = ev.append_evaluation(
    evaluation,
    model_type="TF-IDF Bigrams Gini, 5",
    model_object=tree,
    X=X_train,
    y=y_train,
)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
1,"TF-IDF Gini, 5",0.598837
0,"CV Gini, 5",0.575581
2,"CV Bigrams Gini, 5",0.418605
3,"TF-IDF Bigrams Gini, 5",0.412791


In [50]:
# Train-set accuracy followed by the per-language precision/recall/F1 table.
print('TF-IDF Bigrams Gini, 5 Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True, zero_division=0))

TF-IDF Bigrams Gini, 5 Model Train Accuracy: 41.28%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.2,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.412791,0.293333,0.527132
recall,0.0,0.0,0.153846,0.0,0.0,0.0,0.0,1.0,0.631579,0.6,...,0.421053,0.0,0.0,0.0,0.0,0.428571,1.0,0.412791,0.211752,0.412791
f1-score,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.333333,0.774194,0.75,...,0.592593,0.0,0.0,0.0,0.0,0.6,1.0,0.412791,0.215006,0.389314
support,3.0,2.0,13.0,5.0,7.0,2.0,15.0,25.0,38.0,10.0,...,19.0,2.0,6.0,2.0,2.0,7.0,3.0,0.412791,172.0,172.0


In [51]:
# Test-set accuracy followed by the per-language precision/recall/F1 table.
print('TF-IDF Bigrams Gini, 5 Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True, zero_division=0))

TF-IDF Bigrams Gini, 5 Model Test Accuracy: 37.21%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,Jupyter Notebook,PHP,Python,R,Ruby,Rust,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.3125,0.8,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.372093,0.115179,0.319186
recall,0.0,0.0,0.0,0.0,0.0,1.0,0.571429,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.372093,0.126531,0.372093
f1-score,0.0,0.0,0.0,0.0,0.0,0.47619,0.666667,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.372093,0.102041,0.285714
support,2.0,5.0,1.0,2.0,1.0,10.0,7.0,0.0,2.0,10.0,1.0,1.0,1.0,0.0,0.372093,43.0,43.0


**Model #5: TF-IDF Unigrams & Bigrams**

In [52]:
# Model #5: depth-5 decision tree on TF-IDF unigram + bigram features.
# NOTE(review): the vectorizer is fit on every README before the split, so the
# vocabulary and IDF weights also see the test documents.
tfidf_unigrams_and_bigrams = TfidfVectorizer(ngram_range=(1, 2))
X, y = tfidf_unigrams_and_bigrams.fit_transform(df.clean_readme_contents), df.language

# train/test split via the project's helper, using its default arguments
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

tree = DecisionTreeClassifier(max_depth=5, random_state=56).fit(X_train, y_train)

# Pair each actual language with the tree's prediction, one frame per split.
train = pd.DataFrame({"actual": y_train}).assign(predicted=tree.predict(X_train))
test = pd.DataFrame({"actual": y_test}).assign(predicted=tree.predict(X_test))

# Log this model (scored on the training split) and show the running leaderboard.
evaluation = ev.append_evaluation(
    evaluation,
    model_type="TF-IDF Unigrams & Bigrams Gini, 5",
    model_object=tree,
    X=X_train,
    y=y_train,
)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.610465
1,"TF-IDF Gini, 5",0.598837
0,"CV Gini, 5",0.575581
2,"CV Bigrams Gini, 5",0.418605
3,"TF-IDF Bigrams Gini, 5",0.412791


In [53]:
# Train-set accuracy followed by the per-language precision/recall/F1 table.
print('TF-IDF Unigrams & Bigrams Gini, 5 Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True, zero_division=0))

TF-IDF Unigrams & Bigrams Gini, 5 Model Train Accuracy: 61.05%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.666667,0.0,1.0,0.0,0.202703,1.0,1.0,1.0,...,0.933333,0.0,0.0,0.5,0.0,1.0,1.0,0.610465,0.446953,0.726134
recall,0.0,0.0,0.153846,0.0,0.714286,0.0,1.0,0.76,0.710526,0.7,...,0.736842,0.0,0.0,1.0,0.0,0.571429,1.0,0.610465,0.417346,0.610465
f1-score,0.0,0.0,0.25,0.0,0.833333,0.0,0.337079,0.863636,0.830769,0.823529,...,0.823529,0.0,0.0,0.666667,0.0,0.727273,1.0,0.610465,0.39668,0.616574
support,3.0,2.0,13.0,5.0,7.0,2.0,15.0,25.0,38.0,10.0,...,19.0,2.0,6.0,2.0,2.0,7.0,3.0,0.610465,172.0,172.0


In [54]:
# Test-set accuracy followed by the per-language precision/recall/F1 table.
print('TF-IDF Unigrams & Bigrams Gini, 5 Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True, zero_division=0))

TF-IDF Unigrams & Bigrams Gini, 5 Model Test Accuracy: 39.53%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,Jupyter Notebook,PHP,Python,R,Ruby,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,1.0,0.058824,1.0,0.833333,0.0,0.5,0.666667,0.0,0.0,0.0,0.395349,0.312217,0.594391
recall,0.0,0.0,0.0,1.0,1.0,0.6,0.714286,0.0,0.5,0.2,0.0,0.0,0.0,0.395349,0.308791,0.395349
f1-score,0.0,0.0,0.0,1.0,0.111111,0.75,0.769231,0.0,0.5,0.307692,0.0,0.0,0.0,0.395349,0.264464,0.44355
support,2.0,5.0,1.0,2.0,1.0,10.0,7.0,0.0,2.0,10.0,1.0,1.0,1.0,0.395349,43.0,43.0


---

**Create a function to append evaluation metrics (accuracy) to evaluation dataframe**


**CHECK**

---

**Model #6: TF-IDF Logistic Regression**

In [55]:
# Model #6: logistic regression on TF-IDF unigram features.
# NOTE(review): the vectorizer is fit on every README before the split, so the
# vocabulary and IDF weights also see the test documents.
tfidf = TfidfVectorizer()
X, y = tfidf.fit_transform(df.clean_readme_contents), df.language

# train/test split via the project's helper, using its default arguments
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

logit = LogisticRegression(random_state=56).fit(X_train, y_train)

# Pair each actual language with the model's prediction, one frame per split.
train = pd.DataFrame({"actual": y_train}).assign(predicted=logit.predict(X_train))
test = pd.DataFrame({"actual": y_test}).assign(predicted=logit.predict(X_test))

# Log this model (scored on the training split) and show the running leaderboard.
evaluation = ev.append_evaluation(
    evaluation,
    model_type="TF-IDF Logistic Regression",
    model_object=logit,
    X=X_train,
    y=y_train,
)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
5,TF-IDF Logistic Regression,0.645349
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.610465
1,"TF-IDF Gini, 5",0.598837
0,"CV Gini, 5",0.575581
2,"CV Bigrams Gini, 5",0.418605
3,"TF-IDF Bigrams Gini, 5",0.412791


In [56]:
# Train-set accuracy followed by the per-language precision/recall/F1 table.
print('TF-IDF Logistic Regression Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True, zero_division=0))

TF-IDF Logistic Regression Model Train Accuracy: 64.53%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.961538,0.391753,1.0,...,0.95,0.0,0.0,0.0,0.0,0.0,0.0,0.645349,0.315165,0.592878
recall,0.0,0.0,0.692308,0.0,0.0,0.0,0.666667,1.0,1.0,0.6,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645349,0.27652,0.645349
f1-score,0.0,0.0,0.818182,0.0,0.0,0.0,0.8,0.980392,0.562963,0.75,...,0.974359,0.0,0.0,0.0,0.0,0.0,0.0,0.645349,0.280658,0.579317
support,3.0,2.0,13.0,5.0,7.0,2.0,15.0,25.0,38.0,10.0,...,19.0,2.0,6.0,2.0,2.0,7.0,3.0,0.645349,172.0,172.0


In [57]:
# Test-set accuracy followed by the per-language precision/recall/F1 table.
print('TF-IDF Logistic Regression Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True, zero_division=0))

TF-IDF Logistic Regression Model Test Accuracy: 34.88%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,1.0,0.212121,0.0,0.6,0.0,0.0,0.0,0.348837,0.15101,0.406624
recall,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0.3,0.0,0.0,0.0,0.348837,0.15,0.348837
f1-score,0.0,0.0,0.0,0.0,0.0,0.666667,0.35,0.0,0.4,0.0,0.0,0.0,0.348837,0.118056,0.305039
support,2.0,5.0,1.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,1.0,0.348837,43.0,43.0


In [58]:
# go_df = df[df.language == "Go"]
# go_df.to_dict("response")

**Model #7: TF-IDF Hyperparameters**

In [59]:
# Model #7: entropy-criterion decision tree (depth 5, at least 2 samples per leaf)
# on TF-IDF unigram features, trained on a 70/30 split.
# NOTE(review): the vectorizer is fit on every README before the split, so the
# vocabulary and IDF weights also see the test documents.
tfidf = TfidfVectorizer()
X, y = tfidf.fit_transform(df.clean_readme_contents), df.language

# train/test split via the project's helper, overriding the split proportions
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y, train_size=.7, test_size=.3)

tree = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
    min_samples_leaf=2,
    random_state=56,
).fit(X_train, y_train)

# Pair each actual language with the tree's prediction, one frame per split.
train = pd.DataFrame({"actual": y_train}).assign(predicted=tree.predict(X_train))
test = pd.DataFrame({"actual": y_test}).assign(predicted=tree.predict(X_test))

# Log this model (scored on the training split) and show the running leaderboard.
evaluation = ev.append_evaluation(
    evaluation,
    model_type="TF-IDF Entropy, 5, 2, Train 70%",
    model_object=tree,
    X=X_train,
    y=y_train,
)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.66
5,TF-IDF Logistic Regression,0.645349
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.610465
1,"TF-IDF Gini, 5",0.598837
0,"CV Gini, 5",0.575581
2,"CV Bigrams Gini, 5",0.418605
3,"TF-IDF Bigrams Gini, 5",0.412791


In [60]:
# Train-set accuracy followed by the per-language precision/recall/F1 table.
print('TF-IDF Entropy, 5, 2, Train 70% Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True, zero_division=0))

TF-IDF Entropy, 5, 2, Train 70% Model Train Accuracy: 66.00%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.428571,0.0,0.333333,0.0,0.387097,1.0,0.653061,1.0,...,0.923077,0.0,0.0,0.0,0.0,1.0,1.0,0.66,0.361257,0.615216
recall,0.0,0.0,0.272727,0.0,0.428571,0.0,0.923077,0.826087,0.888889,0.9,...,0.705882,0.0,0.0,0.0,0.0,0.666667,1.0,0.66,0.380595,0.66
f1-score,0.0,0.0,0.333333,0.0,0.375,0.0,0.545455,0.904762,0.752941,0.947368,...,0.8,0.0,0.0,0.0,0.0,0.8,1.0,0.66,0.356276,0.616256
support,3.0,2.0,11.0,5.0,7.0,1.0,13.0,23.0,36.0,10.0,...,17.0,1.0,6.0,1.0,1.0,3.0,3.0,0.66,150.0,150.0


In [61]:
# Test-set accuracy followed by the per-language precision/recall/F1 table.
print('TF-IDF Entropy, 5, 2, Train 70% Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
# zero_division=0 reports the same 0.0 scores as the default ("warn") but without the
# UndefinedMetricWarning raised for languages with no predicted (or no true) samples.
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True, zero_division=0))

TF-IDF Entropy, 5, 2, Train 70% Model Test Accuracy: 38.46%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,PHP,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.166667,0.0,0.0,0.0,0.142857,0.833333,0.470588,0.0,0.5,...,0.875,0.0,0.0,0.0,0.0,0.0,0.0,0.384615,0.166025,0.443547
recall,0.0,0.142857,0.0,0.0,0.0,1.0,0.416667,0.888889,0.0,0.2,...,0.583333,0.0,0.0,0.0,0.0,0.0,0.0,0.384615,0.179541,0.384615
f1-score,0.0,0.153846,0.0,0.0,0.0,0.25,0.555556,0.615385,0.0,0.285714,...,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.384615,0.14225,0.367087
support,2.0,7.0,1.0,2.0,1.0,3.0,12.0,9.0,0.0,5.0,...,12.0,2.0,1.0,2.0,1.0,4.0,0.0,0.384615,65.0,65.0


In [62]:
# Top 20 terms by decision-tree feature importance.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when the installed version provides it.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
pd.Series(dict(zip(feature_names, tree.feature_importances_))).sort_values(ascending=False).head(20)

npm                0.206330
python             0.177057
java               0.130670
latest             0.098097
want               0.075636
code               0.044724
list               0.042458
im                 0.041412
br                 0.039007
readme             0.032695
build              0.029110
project            0.028490
auxiliary          0.023324
version            0.016475
library            0.014516
fish               0.000000
fused              0.000000
furnace            0.000000
furnes             0.000000
firstbadversion    0.000000
dtype: float64

In [63]:
# Leaderboard of all models logged so far, best recorded accuracy first.
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.66
5,TF-IDF Logistic Regression,0.645349
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.610465
1,"TF-IDF Gini, 5",0.598837
0,"CV Gini, 5",0.575581
2,"CV Bigrams Gini, 5",0.418605
3,"TF-IDF Bigrams Gini, 5",0.412791


---
**Which words appear most frequently for each language?**

---

### Stratification Detour

In [64]:
# df.shape

In [65]:
# df.language.value_counts() >= 2

In [66]:
# df.groupby('language').filter(lambda x : len(x) >= 2)

In [67]:
# df.groupby("language").language.agg(["count"]).sort_values(by="count") 

In [68]:
# df[df.groupby("language").language.transform("count") >= 2]

In [69]:
# df = df[df.groupby("language").language.transform("count") >= 2]

In [70]:
# df[df.language == "ApacheConf"]

**Model #8: TF-IDF Bigrams Logistic Regression**

In [71]:
# Model #8: logistic regression on TF-IDF bigram features.
# NOTE(review): the vectorizer is fit on every README before the split, so the
# vocabulary and IDF weights also see the test documents.
tfidf = TfidfVectorizer(ngram_range=(2, 2))
X, y = tfidf.fit_transform(df.clean_readme_contents), df.language

# train/test split via the project's helper, using its default arguments
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

logit = LogisticRegression(random_state=56).fit(X_train, y_train)

# Pair each actual language with the model's prediction, one frame per split.
train = pd.DataFrame({"actual": y_train}).assign(predicted=logit.predict(X_train))
test = pd.DataFrame({"actual": y_test}).assign(predicted=logit.predict(X_test))

# Log this model (scored on the training split) and show the running leaderboard.
evaluation = ev.append_evaluation(
    evaluation,
    model_type="TF-IDF Bigrams Logistic Regression",
    model_object=logit,
    X=X_train,
    y=y_train,
)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.66
5,TF-IDF Logistic Regression,0.645349
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.610465
7,TF-IDF Bigrams Logistic Regression,0.610465
1,"TF-IDF Gini, 5",0.598837
0,"CV Gini, 5",0.575581
2,"CV Bigrams Gini, 5",0.418605
3,"TF-IDF Bigrams Gini, 5",0.412791


In [72]:
# report accuracy on the training split, then the per-language precision/recall/f1 table
print('TF-IDF Bigrams Logistic Regression Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
# zero_division=0 keeps the 0.0 scores for languages the model never predicts
# without scikit-learn writing an UndefinedMetricWarning into the cell output
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True, zero_division=0))

TF-IDF Bigrams Logistic Regression Model Train Accuracy: 61.05%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.361905,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610465,0.318095,0.597398
recall,0.0,0.0,0.230769,0.0,0.0,0.0,0.933333,1.0,1.0,0.2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610465,0.246777,0.610465
f1-score,0.0,0.0,0.375,0.0,0.0,0.0,0.965517,1.0,0.531469,0.333333,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610465,0.24663,0.534755
support,3.0,2.0,13.0,5.0,7.0,2.0,15.0,25.0,38.0,10.0,...,19.0,2.0,6.0,2.0,2.0,7.0,3.0,0.610465,172.0,172.0


In [73]:
# report accuracy on the test split, then the per-language precision/recall/f1 table
print('TF-IDF Bigrams Logistic Regression Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
# zero_division=0 keeps the 0.0 scores for languages the model never predicts
# without scikit-learn writing an UndefinedMetricWarning into the cell output
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True, zero_division=0))

TF-IDF Bigrams Logistic Regression Model Test Accuracy: 23.26%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,1.0,0.175,0.0,1.0,0.0,0.0,0.0,0.232558,0.18125,0.493605
recall,0.0,0.0,0.0,0.0,0.0,0.2,1.0,0.0,0.1,0.0,0.0,0.0,0.232558,0.108333,0.232558
f1-score,0.0,0.0,0.0,0.0,0.0,0.333333,0.297872,0.0,0.181818,0.0,0.0,0.0,0.232558,0.067752,0.168294
support,2.0,5.0,1.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,1.0,0.232558,43.0,43.0


**Model #9: TF-IDF Unigrams & Bigrams Logistic Regression**

In [74]:
# establish features and target for the TF-IDF unigrams & bigrams logistic regression model

# create TF-IDF vectorizer object that builds both unigram and bigram features (ngram_range=(1, 2))
tfidf = TfidfVectorizer(ngram_range=(1, 2))

# use tfidf object to create model features
# NOTE(review): the vectorizer is fit on every README before the split below,
# so its vocabulary and IDF weights also reflect rows that land in the test set
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
logit = LogisticRegression(random_state=56)

# fit model object
logit.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = logit.predict(X_train)
test['predicted'] = logit.predict(X_test)

# append evaluation
# NOTE(review): the training split is what is passed here, so the accuracy added to the
# table is presumably train accuracy rather than test accuracy -- confirm in ev.append_evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Unigram & Bigram Logistic Regression", model_object=logit, X=X_train, y=y_train)
# display the running comparison of models, highest recorded accuracy first
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.66
5,TF-IDF Logistic Regression,0.645349
8,TF-IDF Unigram & Bigram Logistic Regression,0.645349
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.610465
7,TF-IDF Bigrams Logistic Regression,0.610465
1,"TF-IDF Gini, 5",0.598837
0,"CV Gini, 5",0.575581
2,"CV Bigrams Gini, 5",0.418605
3,"TF-IDF Bigrams Gini, 5",0.412791


In [75]:
# report accuracy on the training split, then the per-language precision/recall/f1 table
print('TF-IDF Unigram & Bigram Logistic Regression Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
# zero_division=0 keeps the 0.0 scores for languages the model never predicts
# without scikit-learn writing an UndefinedMetricWarning into the cell output
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True, zero_division=0))

TF-IDF Unigram & Bigram Logistic Regression Model Train Accuracy: 64.53%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.383838,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645349,0.319192,0.602243
recall,0.0,0.0,0.615385,0.0,0.0,0.0,0.8,1.0,1.0,0.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645349,0.274341,0.645349
f1-score,0.0,0.0,0.761905,0.0,0.0,0.0,0.888889,1.0,0.554745,0.666667,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645349,0.279974,0.581837
support,3.0,2.0,13.0,5.0,7.0,2.0,15.0,25.0,38.0,10.0,...,19.0,2.0,6.0,2.0,2.0,7.0,3.0,0.645349,172.0,172.0


In [76]:
# report accuracy on the test split, then the per-language precision/recall/f1 table
print('TF-IDF Unigram & Bigram Logistic Regression Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
# zero_division=0 keeps the 0.0 scores for languages the model never predicts
# without scikit-learn writing an UndefinedMetricWarning into the cell output
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True, zero_division=0))

TF-IDF Unigram & Bigram Logistic Regression Model Test Accuracy: 32.56%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,1.0,0.205882,0.0,0.5,0.0,0.0,0.0,0.325581,0.142157,0.382353
recall,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0.2,0.0,0.0,0.0,0.325581,0.141667,0.325581
f1-score,0.0,0.0,0.0,0.0,0.0,0.666667,0.341463,0.0,0.285714,0.0,0.0,0.0,0.325581,0.10782,0.277071
support,2.0,5.0,1.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,1.0,0.325581,43.0,43.0


**Model #10: TF-IDF Naive Bayes**

In [77]:
# establish features and target for the TF-IDF Naive Bayes model

# create TF-IDF vectorizer object with default settings (no ngram_range override)
tfidf = TfidfVectorizer()

# use tfidf object to create model features
# NOTE(review): the vectorizer is fit on every README before the split below,
# so its vocabulary and IDF weights also reflect rows that land in the test set
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
nb = MultinomialNB()

# fit model object
nb.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = nb.predict(X_train)
test['predicted'] = nb.predict(X_test)

# append evaluation
# NOTE(review): the training split is what is passed here, so the accuracy added to the
# table is presumably train accuracy rather than test accuracy -- confirm in ev.append_evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Naive Bayes", model_object=nb, X=X_train, y=y_train)
# display the running comparison of models, highest recorded accuracy first
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.66
5,TF-IDF Logistic Regression,0.645349
8,TF-IDF Unigram & Bigram Logistic Regression,0.645349
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.610465
7,TF-IDF Bigrams Logistic Regression,0.610465
1,"TF-IDF Gini, 5",0.598837
0,"CV Gini, 5",0.575581
9,TF-IDF Naive Bayes,0.482558
2,"CV Bigrams Gini, 5",0.418605
3,"TF-IDF Bigrams Gini, 5",0.412791


In [78]:
# report accuracy on the training split, then the per-language precision/recall/f1 table
print('TF-IDF Naive Bayes Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
# zero_division=0 keeps the 0.0 scores for languages the model never predicts
# without scikit-learn writing an UndefinedMetricWarning into the cell output
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True, zero_division=0))

TF-IDF Naive Bayes Model Train Accuracy: 48.26%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.299213,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.482558,0.264961,0.542849
recall,0.0,0.0,0.153846,0.0,0.0,0.0,0.266667,1.0,1.0,0.3,...,0.578947,0.0,0.0,0.0,0.0,0.0,0.0,0.482558,0.164973,0.482558
f1-score,0.0,0.0,0.266667,0.0,0.0,0.0,0.421053,1.0,0.460606,0.461538,...,0.733333,0.0,0.0,0.0,0.0,0.0,0.0,0.482558,0.16716,0.411827
support,3.0,2.0,13.0,5.0,7.0,2.0,15.0,25.0,38.0,10.0,...,19.0,2.0,6.0,2.0,2.0,7.0,3.0,0.482558,172.0,172.0


In [79]:
# report accuracy on the test split, then the per-language precision/recall/f1 table
print('TF-IDF Naive Bayes Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
# zero_division=0 keeps the 0.0 scores for languages the model never predicts
# without scikit-learn writing an UndefinedMetricWarning into the cell output
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True, zero_division=0))

TF-IDF Naive Bayes Model Test Accuracy: 30.23%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,1.0,0.189189,0.0,1.0,0.0,0.0,0.0,0.302326,0.182432,0.495915
recall,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0.1,0.0,0.0,0.0,0.302326,0.133333,0.302326
f1-score,0.0,0.0,0.0,0.0,0.0,0.666667,0.318182,0.0,0.181818,0.0,0.0,0.0,0.302326,0.097222,0.249119
support,2.0,5.0,1.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,1.0,0.302326,43.0,43.0


**Model #11: TF-IDF Bigrams Naive Bayes**

In [80]:
# establish features and target for the TF-IDF bigrams Naive Bayes model

# create TF-IDF vectorizer object that builds bigram features only (ngram_range=(2, 2))
tfidf = TfidfVectorizer(ngram_range=(2, 2))

# use tfidf object to create model features
# NOTE(review): the vectorizer is fit on every README before the split below,
# so its vocabulary and IDF weights also reflect rows that land in the test set
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
nb = MultinomialNB()

# fit model object
nb.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = nb.predict(X_train)
test['predicted'] = nb.predict(X_test)

# append evaluation
# NOTE(review): the training split is what is passed here, so the accuracy added to the
# table is presumably train accuracy rather than test accuracy -- confirm in ev.append_evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Bigrams Naive Bayes", model_object=nb, X=X_train, y=y_train)
# display the running comparison of models, highest recorded accuracy first
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.66
5,TF-IDF Logistic Regression,0.645349
8,TF-IDF Unigram & Bigram Logistic Regression,0.645349
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.610465
7,TF-IDF Bigrams Logistic Regression,0.610465
1,"TF-IDF Gini, 5",0.598837
0,"CV Gini, 5",0.575581
10,TF-IDF Bigrams Naive Bayes,0.569767
9,TF-IDF Naive Bayes,0.482558
2,"CV Bigrams Gini, 5",0.418605


In [81]:
# inspect the raw test-split predictions to see which languages the model actually outputs
nb.predict(X_test)

array(['JavaScript', 'JavaScript', 'Python', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'Java', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'Java', 'JavaScript', 'JavaScript'],
      dtype='<U16')

In [82]:
# report accuracy on the training split, then the per-language precision/recall/f1 table
print('TF-IDF Bigrams Naive Bayes Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
# zero_division=0 keeps the 0.0 scores for languages the model never predicts
# without scikit-learn writing an UndefinedMetricWarning into the cell output
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True, zero_division=0))

TF-IDF Bigrams Naive Bayes Model Train Accuracy: 56.98%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.339286,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.569767,0.266964,0.551703
recall,0.0,0.0,0.230769,0.0,0.0,0.0,0.733333,1.0,1.0,0.2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.569767,0.208205,0.569767
f1-score,0.0,0.0,0.375,0.0,0.0,0.0,0.846154,1.0,0.506667,0.333333,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.569767,0.203058,0.489267
support,3.0,2.0,13.0,5.0,7.0,2.0,15.0,25.0,38.0,10.0,...,19.0,2.0,6.0,2.0,2.0,7.0,3.0,0.569767,172.0,172.0


In [83]:
# report accuracy on the test split, then the per-language precision/recall/f1 table
print('TF-IDF Bigrams Naive Bayes Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
# zero_division=0 keeps the 0.0 scores for languages the model never predicts
# without scikit-learn writing an UndefinedMetricWarning into the cell output
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True, zero_division=0))

TF-IDF Bigrams Naive Bayes Model Test Accuracy: 23.26%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,1.0,0.175,0.0,1.0,0.0,0.0,0.0,0.232558,0.18125,0.493605
recall,0.0,0.0,0.0,0.0,0.0,0.2,1.0,0.0,0.1,0.0,0.0,0.0,0.232558,0.108333,0.232558
f1-score,0.0,0.0,0.0,0.0,0.0,0.333333,0.297872,0.0,0.181818,0.0,0.0,0.0,0.232558,0.067752,0.168294
support,2.0,5.0,1.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,1.0,0.232558,43.0,43.0


---
# Write a Function

In [84]:
def create_vectorizer_features_and_target():
    """
    Build everything needed to fit a language-prediction model.

    Steps performed:
    1. Loads the README DataFrame through pr.wrangle_readme_data
    2. Fits a default TfidfVectorizer on the clean_readme_contents column and
    transforms that column into the feature matrix
    3. Uses the language column as the target
    4. Splits features and target with pp.split_repo_data and keeps only the train portion

    Returns a 3-tuple: (fitted vectorizer, train features, train target).
    The test portion of the split is discarded.
    """

    readme_df = pr.wrangle_readme_data()

    # the vectorizer is fit on every row, before the split happens
    vectorizer = TfidfVectorizer()
    all_features = vectorizer.fit_transform(readme_df.clean_readme_contents)

    all_targets = readme_df.language

    # only the training half of the split is handed back to the caller
    train_features, _, train_target, _ = pp.split_repo_data(all_features, all_targets)

    return vectorizer, train_features, train_target

In [85]:
# build the fitted vectorizer plus the training features and target
vectorizer, features, target = create_vectorizer_features_and_target()
# show the vectorizer's settings, then the repr of the training feature matrix
print(vectorizer)
features

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)


<172x21344 sparse matrix of type '<class 'numpy.float64'>'
	with 58152 stored elements in Compressed Sparse Row format>

In [86]:
target

193           C++
42         Python
210    TypeScript
70           HTML
21     JavaScript
          ...    
122        Python
162          Java
192          Ruby
143          Ruby
85     JavaScript
Name: language, Length: 172, dtype: object

In [87]:
def create_and_fit_model(features, target, max_depth=5, random_state=56):
    """
    Fit a decision tree classifier on the supplied features and target.

    This function does the following:
    1. Creates a DecisionTreeClassifier with the requested max_depth and random_state
    2. Fits the model on features and target
    3. Returns the fitted model object

    Parameters:
        features: feature matrix passed straight to DecisionTreeClassifier.fit
        target: labels passed straight to DecisionTreeClassifier.fit
        max_depth: maximum tree depth; defaults to 5, the previously hard-coded value
        random_state: seed for the classifier; defaults to 56, the previously hard-coded value

    Calling the function with only features and target behaves exactly as before.
    """

    # create model object
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)

    # fit model object
    tree.fit(features, target)

    # return model object to be used later to predict
    return tree

In [88]:
# fit the model on the training features and target, then display its settings
model = create_and_fit_model(features, target)
model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=56, splitter='best')

In [89]:
def predict_language(string: str, vectorizer=None, model=None) -> str:
    """
    Predict the programming language of a repository from the text of its README.

    This function does the following:
    1. Checks that vectorizer and model are either both supplied or both omitted
    2. When both are omitted, calls create_vectorizer_features_and_target and
    create_and_fit_model to fit them from scratch (this re-wrangles the data and
    refits on every call, so pass already-fitted objects when predicting repeatedly)
    3. Prepares the input of the string argument, the text of a README file, for modeling by calling the following
    functions:
        a. pr.basic_clean
        b. pr.tokenize
        c. pr.lemmatize
        d. pr.remove_stopwords + additional_stopwords
    4. Creates features (X) out of the string_sans_stopwords variable using the vectorizer
    to transform
    5. Predicts the language of the features (X)
    6. Returns the first (only) element of the prediction array

    Parameters:
        string: raw text of a README file
        vectorizer: optional already-fitted vectorizer exposing .transform; must be the
            one whose features the model was fitted on
        model: optional already-fitted model exposing .predict

    Raises:
        ValueError: if only one of vectorizer / model is supplied
    """

    # a vectorizer and a model only make sense as a matched pair
    if (vectorizer is None) != (model is None):
        raise ValueError("vectorizer and model must be passed together or both omitted")

    if vectorizer is None:
        # call create_vectorizer_features_and_target function to get data to fit model
        vectorizer, features, target = create_vectorizer_features_and_target()

        # call create_and_fit_model to get model that will predict language of string input
        model = create_and_fit_model(features, target)

    # call pr.basic_clean
    string = pr.basic_clean(string)

    # call pr.tokenize
    list_of_tokens = pr.tokenize(string)

    # call pr.lemmatize
    list_of_lemmas = pr.lemmatize(list_of_tokens)

    # additional_stopwords variable
    additional_stopwords = ["img", "1", "yes", "see", "width20", "height20", "okay_icon", "unknown"]

    # call pr.remove_stopwords; only the second element of the returned pair is needed here
    _, string_sans_stopwords = pr.remove_stopwords(list_of_lemmas, extra_stopwords=additional_stopwords, exclude_stopwords=[])

    # create X variable for model as vectorized string_sans_stopwords
    # (wrapped in a list because a single document is being transformed)
    X = vectorizer.transform([string_sans_stopwords])

    # predict language of README
    predicted_language = model.predict(X)

    # index numpy.ndarray object for string of language
    language_as_string = predicted_language[0]

    return language_as_string

In [90]:
language = predict_language("""GitHub Natural Language Processing Project
Purpose
This repository holds all resources used in the attainment of the goals established for the GitHub Natural Language Processing Project.

Goals
Build a model that can predict the programming language of a repository given the text data of the accompanying README file.

Data
Repository data scraped from GitHub.

Data Dictionary
repo: the name of the GitHub repository
language: the primary language the GitHub repository
readme_contents: the original contents of the README file
clean_readme_contents: the cleaned contents of the README file used in analysis and modeling
len_of_clean_readme_contents: length of the clean lemmas in the clean_readme_contents feature
Audience
The audience for this project is the layperson.

Deliverables
Need to Haves:
Model
A well-documented jupyter notebook that contains our analysis
Presentation summarizing our findings
Nice to Haves:
GUI for model
Cloning
All files necessary for cloning and reproducing the work found in the final_project.ipynb file are contained within this repository.""")

In [91]:
language

'HTML'

In [92]:
print(language)

HTML


In [93]:
type(language)

str

In [94]:
len("Pneumonoultramicroscopicsilicovolcanoconiosis")

45