# Modeling

In [1]:
import numpy as np
import pandas as pd

import unicodedata

import re

import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

import acquire as ac
import prepare as pr
import preprocessing as pp
import evaluate as ev

---
## Wrangle

In [2]:
df = pr.wrangle_readme_data()
df

Unnamed: 0,repo,language,readme_contents,clean_readme_contents,len_of_clean_readme_contents
0,rdpeng/ProgrammingAssignment2,R,### Introduction\n\nThis second programming as...,introduction second programming assignment req...,345
1,octocat/Spoon-Knife,HTML,### Well hello there!\n\nThis repository is me...,well hello repository meant provide example fo...,74
2,tensorflow/tensorflow,C++,"<div align=""center"">\n <img src=""https://www....",div align center src http www tensorflow org i...,1432
3,SmartThingsCommunity/SmartThingsPublic,Groovy,# SmartThings Public GitHub Repo\n\nAn officia...,smartthings public github repo official list s...,62
4,twbs/bootstrap,JavaScript,"<p align=""center"">\n <a href=""https://getboot...",p align center href http getbootstrap com src ...,1360
...,...,...,...,...,...
211,akveo/ngx-admin,TypeScript,"# ngx-admin [<img src=""https://i.imgur.com/oMc...",ngx admin src http imgur com omcxwz png alt ev...,557
212,swirldev/swirl_courses,R,# swirl courses\n\nThis is a collection of int...,swirl course collection interactive course use...,475
213,jrowberg/i2cdevlib,C++,Jennic platform added!\n\n====================...,jennic platform added note detail project plea...,267
214,etcd-io/etcd,Go,# etcd\n\n[![Go Report Card](https://goreportc...,etcd go report card http goreportcard com badg...,1034


In [3]:
df.columns

Index(['repo', 'language', 'readme_contents', 'clean_readme_contents',
       'len_of_clean_readme_contents'],
      dtype='object')

In [4]:
df.repo.value_counts().head(18)

axios/axios                                                                    2
fengdu78/Coursera-ML-AndrewNg-Notes                                            2
rust-lang/rust                                                                 2
CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers    2
tensorflow/models                                                              2
nightscout/cgm-remote-monitor                                                  2
arduino/Arduino                                                                2
crossoverJie/JCSprout                                                          2
soimort/you-get                                                                2
twbs/bootstrap                                                                 2
tensorflow/tensorflow                                                          2
apache/incubator-mxnet                                                         2
SmartThingsCommunity/SmartTh

In [5]:
df.repo.value_counts().tail()

square/retrofit                          1
scikit-learn/scikit-learn                1
zxing/zxing                              1
ArduPilot/ardupilot                      1
academicpages/academicpages.github.io    1
Name: repo, dtype: int64

In [6]:
len(df.repo.unique())

199

In [7]:
# stratification base on language using train_test_split won't work unless we have more than one observation
# per language
df.language.value_counts()

JavaScript          46
Java                35
Python              29
C++                 18
HTML                16
Jupyter Notebook    10
PHP                  9
Go                   9
Ruby                 7
TypeScript           7
CSS                  6
C                    5
Vue                  3
R                    3
Rust                 3
PowerShell           2
Shell                2
Kotlin               2
Groovy               2
C#                   2
Name: language, dtype: int64

---
## Model

---
### Outlier Detour

In [8]:
# lemmas
# list_of_readmes = df.clean_readme_contents.tolist()
# list_of_readmes

In [9]:
# for index in range(len(list_of_readmes)):
#     list_of_readmes[index] = list_of_readmes[index].split()
    

# print(list_of_readmes)

In [10]:
# list_of_list_of_lemmas = [lemma.split() for lemmas in list_of_lemmas]
# list_of_list_of_lemmas

In [11]:
# len(list_of_readmes[0])

In [12]:
# len(list_of_readmes[-1])

In [13]:
# len_of_readmes = [len(readme) for readme in list_of_readmes]
# len_of_readmes

In [14]:
# len_of_readmes.sort()

In [15]:
# print(len_of_readmes)

**Cutoff for length of words in readme = 10**

In [16]:
index_three_list = df[df.index == 3].clean_readme_contents.values.tolist()
index_three_list

['smartthings public github repo official list smartapps device type smartthings link help get started coding right away github specific documentation http doc smartthings com en latest tool ide github integration html full documentation http doc smartthings com ide simulator http ide smartthings com community forum http community smartthings com follow u web twitter http twitter com smartthingsdev facebook http facebook com smartthingsdevelopers']

In [17]:
df[df.index == 3].len_of_clean_readme_contents

3    62
Name: len_of_clean_readme_contents, dtype: int64

**len_of_clean_readme_contents seems to be working**

In [18]:
df.len_of_clean_readme_contents.value_counts().sort_index()

10       1
13       2
14       1
23       1
34       1
        ..
9148     1
9735     1
11321    1
17629    1
29114    1
Name: len_of_clean_readme_contents, Length: 188, dtype: int64

In [19]:
df.shape

(216, 5)

**225 observations before outlier removal; should have 219 after**

**CHECK**

---
### Back on Model Track

#### Train/Test Split

In [20]:
df = pr.wrangle_readme_data()
print(df.shape)
df.head()

(216, 5)


Unnamed: 0,repo,language,readme_contents,clean_readme_contents,len_of_clean_readme_contents
0,rdpeng/ProgrammingAssignment2,R,### Introduction\n\nThis second programming as...,introduction second programming assignment req...,345
1,octocat/Spoon-Knife,HTML,### Well hello there!\n\nThis repository is me...,well hello repository meant provide example fo...,74
2,tensorflow/tensorflow,C++,"<div align=""center"">\n <img src=""https://www....",div align center src http www tensorflow org i...,1432
3,SmartThingsCommunity/SmartThingsPublic,Groovy,# SmartThings Public GitHub Repo\n\nAn officia...,smartthings public github repo official list s...,62
4,twbs/bootstrap,JavaScript,"<p align=""center"">\n <a href=""https://getboot...",p align center href http getbootstrap com src ...,1360


## Modeling

Transform your documents into a form that can be used in a machine learning model. You should use the programming language of the repository as the label to predict.

Try fitting several different models and using several different representations of the text (e.g. a simple bag of words, then also the TF-IDF values for each).

Build a function that will take in the text of a README file, and tries to predict the programming language.

**CountVectorizer**

In [21]:
# create cv object
cv = CountVectorizer()

In [22]:
# fit and use the cv object
cv_bag_of_words = cv.fit_transform(df.clean_readme_contents)

In [23]:
cv_bag_of_words

<216x21354 sparse matrix of type '<class 'numpy.int64'>'
	with 73203 stored elements in Compressed Sparse Row format>

**TF-IDF**

In [24]:
# create tfidf vectorizer object
tfidf = TfidfVectorizer()

In [25]:
tfidf_bag_of_words = tfidf.fit_transform(df.clean_readme_contents)

In [26]:
tfidf_bag_of_words

<216x21354 sparse matrix of type '<class 'numpy.float64'>'
	with 73203 stored elements in Compressed Sparse Row format>

**CountVectorizer Bag of Bigrams**

In [27]:
# create cv for bigrams
cv_bigrams = CountVectorizer(ngram_range=(2, 2))

In [28]:
cv_bag_of_bigrams = cv_bigrams.fit_transform(df.clean_readme_contents)

In [29]:
cv_bag_of_bigrams

<216x136949 sparse matrix of type '<class 'numpy.int64'>'
	with 175281 stored elements in Compressed Sparse Row format>

**TF-IDF Bag of Bigrams**

In [30]:
tfidf_bigrams = TfidfVectorizer(ngram_range=(2, 2))

In [31]:
tfidf_bag_of_bigrams = tfidf_bigrams.fit_transform(df.clean_readme_contents)

In [32]:
tfidf_bag_of_bigrams

<216x136949 sparse matrix of type '<class 'numpy.float64'>'
	with 175281 stored elements in Compressed Sparse Row format>

___

**Model #1: Standard CV**

In [33]:
# establish features and target for CountVectorizer model
X = cv.fit_transform(df.clean_readme_contents)
y = df.language

In [34]:
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

In [35]:
print(X_train.shape)
print(y_train.shape)
print(X_train.shape[0]/df.shape[0])
print(y_train.shape[0]/df.shape[0])

(172, 21354)
(172,)
0.7962962962962963
0.7962962962962963


In [36]:
print(X_test.shape)
print(y_test.shape)
print(X_test.shape[0]/df.shape[0])
print(y_test.shape[0]/df.shape[0])

(44, 21354)
(44,)
0.2037037037037037
0.2037037037037037


In [37]:
pd.DataFrame(X_train[:5, :].todense(), columns=cv.get_feature_names())

Unnamed: 0,aa,aaaa,aaaaaaaaaac,aaaaaaaaaai,aaaaaaaad,aaaaaaaaecm,aaaaaaaaerc,aaaaaaaaex,aab,aac,...,zxingorg,zybpzd,zygmuntz,zyiot,zyiz,zynga,zypper,zyszys,zz,zzm
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# create evaluation DataFrame
evaluation = pd.DataFrame(columns=["model_type", "accuracy"])

In [39]:
# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(max_depth=5, random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="CV Gini, 5", model_object=tree, X=X_train, y=y_train)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
0,"CV Gini, 5",0.569767


In [40]:
print('CV DT Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

CV DT Model Train Accuracy: 56.98%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.833333,0.0,0.174419,1.0,0.965517,1.0,...,0.909091,0.0,0.0,0.0,0.0,1.0,1.0,0.569767,0.444118,0.717311
recall,0.0,0.0,0.307692,0.0,0.714286,0.0,1.0,0.8,0.717949,0.8,...,0.526316,0.0,0.0,0.0,0.0,0.571429,1.0,0.569767,0.346884,0.569767
f1-score,0.0,0.0,0.470588,0.0,0.769231,0.0,0.29703,0.888889,0.823529,0.888889,...,0.666667,0.0,0.0,0.0,0.0,0.727273,1.0,0.569767,0.359938,0.588822
support,3.0,2.0,13.0,3.0,7.0,2.0,15.0,25.0,39.0,10.0,...,19.0,2.0,7.0,2.0,2.0,7.0,3.0,0.569767,172.0,172.0


In [41]:
print('CV DT Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

CV DT Model Test Accuracy: 29.55%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Rust,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.888889,0.666667,0.0,0.25,0.0,0.0,0.0,0.295455,0.150463,0.364899
recall,0.0,0.0,0.0,0.0,0.0,0.8,0.571429,0.0,0.1,0.0,0.0,0.0,0.295455,0.122619,0.295455
f1-score,0.0,0.0,0.0,0.0,0.0,0.842105,0.615385,0.0,0.142857,0.0,0.0,0.0,0.295455,0.133362,0.321757
support,2.0,5.0,3.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,0.0,0.295455,44.0,44.0


**Model #2: TF-IDF**

In [42]:
# establish features and target for tfidf model

# create tfidf vectorizer object
tfidf = TfidfVectorizer()

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(max_depth=5, random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Gini, 5", model_object=tree, X=X_train, y=y_train)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
1,"TF-IDF Gini, 5",0.587209
0,"CV Gini, 5",0.569767


In [43]:
X

<216x21354 sparse matrix of type '<class 'numpy.float64'>'
	with 73203 stored elements in Compressed Sparse Row format>

In [44]:
print('TF-IDF DT Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF DT Model Train Accuracy: 58.72%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,1.0,0.0,0.180723,1.0,1.0,1.0,...,0.8125,0.0,0.0,0.0,0.0,1.0,1.0,0.587209,0.399661,0.710165
recall,0.0,0.0,0.307692,0.0,0.714286,0.0,1.0,0.8,0.692308,0.9,...,0.684211,0.0,0.0,0.0,0.0,0.714286,1.0,0.587209,0.340639,0.587209
f1-score,0.0,0.0,0.470588,0.0,0.833333,0.0,0.306122,0.888889,0.818182,0.947368,...,0.742857,0.0,0.0,0.0,0.0,0.833333,1.0,0.587209,0.342034,0.599392
support,3.0,2.0,13.0,3.0,7.0,2.0,15.0,25.0,39.0,10.0,...,19.0,2.0,7.0,2.0,2.0,7.0,3.0,0.587209,172.0,172.0


In [45]:
print('TF-IDF DT Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF DT Model Test Accuracy: 38.64%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,Jupyter Notebook,PHP,Python,R,Rust,TypeScript,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,1.0,0.0,0.875,0.833333,0.0,0.0,0.428571,0.0,0.0,0.0,0.386364,0.2413,0.474297
recall,0.0,0.0,0.0,1.0,0.0,0.7,0.714286,0.0,0.0,0.3,0.0,0.0,0.0,0.386364,0.208791,0.386364
f1-score,0.0,0.0,0.0,1.0,0.0,0.777778,0.769231,0.0,0.0,0.352941,0.0,0.0,0.0,0.386364,0.223073,0.424814
support,2.0,5.0,3.0,2.0,1.0,10.0,7.0,0.0,2.0,10.0,1.0,1.0,0.0,0.386364,44.0,44.0


**Model #3: CV Bigrams**

In [46]:
# establish features and target for cv model

# create cv_bigrams vectorizer object
cv_bigrams = CountVectorizer(ngram_range=(2, 2))

# use tfidf object to create model features
X = cv_bigrams.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(max_depth=5, random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="CV Bigrams Gini, 5", model_object=tree, X=X_train, y=y_train)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
1,"TF-IDF Gini, 5",0.587209
0,"CV Gini, 5",0.569767
2,"CV Bigrams Gini, 5",0.412791


In [47]:
print('CV Bigrams Gini, 5 Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

CV Bigrams Gini, 5 Model Train Accuracy: 41.28%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.5,0.0,0.140187,0.923077,0.8,0.875,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.412791,0.311913,0.625754
recall,0.0,0.0,0.153846,0.0,0.142857,0.0,1.0,0.48,0.615385,0.7,...,0.473684,0.0,0.0,0.0,0.0,0.142857,0.0,0.412791,0.185431,0.412791
f1-score,0.0,0.0,0.266667,0.0,0.222222,0.0,0.245902,0.631579,0.695652,0.777778,...,0.642857,0.0,0.0,0.0,0.0,0.25,0.0,0.412791,0.186633,0.426586
support,3.0,2.0,13.0,3.0,7.0,2.0,15.0,25.0,39.0,10.0,...,19.0,2.0,7.0,2.0,2.0,7.0,3.0,0.412791,172.0,172.0


In [48]:
print('CV Bigrams Gini, 5 Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

CV Bigrams Gini, 5 Model Test Accuracy: 22.73%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.035714,0.333333,0.714286,0.0,0.5,0.0,0.0,0.227273,0.143939,0.303842
recall,0.0,0.0,0.0,0.0,1.0,0.1,0.714286,0.0,0.3,0.0,0.0,0.227273,0.192208,0.227273
f1-score,0.0,0.0,0.0,0.0,0.068966,0.153846,0.714286,0.0,0.375,0.0,0.0,0.227273,0.119282,0.235396
support,2.0,5.0,3.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,0.227273,44.0,44.0


**Model #4: TF-IDF Bigrams**

In [49]:
# establish features and target for tfidf model

# create cv_bigrams vectorizer object
tfidf_bigrams = TfidfVectorizer(ngram_range=(2, 2))

# use tfidf object to create model features
X = tfidf_bigrams.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(max_depth=5, random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Bigrams Gini, 5", model_object=tree, X=X_train, y=y_train)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
1,"TF-IDF Gini, 5",0.587209
0,"CV Gini, 5",0.569767
3,"TF-IDF Bigrams Gini, 5",0.447674
2,"CV Bigrams Gini, 5",0.412791


In [50]:
print('TF-IDF Bigrams Gini, 5 Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Bigrams Gini, 5 Model Train Accuracy: 44.77%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.287879,0.875,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.447674,0.358144,0.540566
recall,0.0,0.0,0.153846,0.0,0.714286,0.0,0.0,0.48,0.974359,0.7,...,0.578947,0.0,0.0,0.0,0.0,0.142857,0.0,0.447674,0.212215,0.447674
f1-score,0.0,0.0,0.266667,0.0,0.833333,0.0,0.0,0.648649,0.444444,0.777778,...,0.733333,0.0,0.0,0.0,0.0,0.25,0.0,0.447674,0.231044,0.393279
support,3.0,2.0,13.0,3.0,7.0,2.0,15.0,25.0,39.0,10.0,...,19.0,2.0,7.0,2.0,2.0,7.0,3.0,0.447674,172.0,172.0


In [51]:
print('TF-IDF Bigrams Gini, 5 Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Bigrams Gini, 5 Model Test Accuracy: 25.00%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.333333,0.2,0.0,0.5,0.0,0.0,0.25,0.093939,0.221212
recall,0.0,0.0,0.0,0.0,0.0,0.1,1.0,0.0,0.3,0.0,0.0,0.25,0.127273,0.25
f1-score,0.0,0.0,0.0,0.0,0.0,0.153846,0.333333,0.0,0.375,0.0,0.0,0.25,0.07838,0.173223
support,2.0,5.0,3.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,0.25,44.0,44.0


**Model #5: TF-IDF Unigrams & Bigrams**

In [52]:
# establish features and target for tfidf model

# create cv_bigrams vectorizer object
tfidf_unigrams_and_bigrams = TfidfVectorizer(ngram_range=(1, 2))

# use tfidf object to create model features
X = tfidf_unigrams_and_bigrams.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(max_depth=5, random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Unigrams & Bigrams Gini, 5", model_object=tree, X=X_train, y=y_train)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
1,"TF-IDF Gini, 5",0.587209
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.581395
0,"CV Gini, 5",0.569767
3,"TF-IDF Bigrams Gini, 5",0.447674
2,"CV Bigrams Gini, 5",0.412791


In [53]:
print('TF-IDF Unigrams & Bigrams Gini, 5 Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Unigrams & Bigrams Gini, 5 Model Train Accuracy: 58.14%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,1.0,0.0,0.178571,1.0,1.0,1.0,...,0.8125,0.0,0.0,0.0,0.0,1.0,1.0,0.581395,0.449554,0.721605
recall,0.0,0.0,0.307692,0.0,0.714286,0.0,1.0,0.8,0.692308,0.8,...,0.684211,0.0,0.0,0.0,0.0,0.571429,1.0,0.581395,0.353496,0.581395
f1-score,0.0,0.0,0.470588,0.0,0.833333,0.0,0.30303,0.888889,0.818182,0.888889,...,0.742857,0.0,0.0,0.0,0.0,0.727273,1.0,0.581395,0.366985,0.599158
support,3.0,2.0,13.0,3.0,7.0,2.0,15.0,25.0,39.0,10.0,...,19.0,2.0,7.0,2.0,2.0,7.0,3.0,0.581395,172.0,172.0


In [54]:
print('TF-IDF Unigrams & Bigrams Gini, 5 Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Unigrams & Bigrams Gini, 5 Model Test Accuracy: 25.00%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,Jupyter Notebook,Kotlin,PHP,Python,R,Rust,TypeScript,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.6,0.833333,0.0,0.0,0.0,0.428571,0.0,0.0,0.0,0.25,0.132993,0.366342
recall,0.0,0.0,0.0,0.0,0.0,0.3,0.714286,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.25,0.093878,0.25
f1-score,0.0,0.0,0.0,0.0,0.0,0.4,0.769231,0.0,0.0,0.0,0.352941,0.0,0.0,0.0,0.25,0.108727,0.293501
support,2.0,5.0,3.0,2.0,1.0,10.0,7.0,0.0,0.0,2.0,10.0,1.0,1.0,0.0,0.25,44.0,44.0


---

**Create a function to append evaluation metrics (accuracy) to evaluation dataframe**


**CHECK**

---

**Model #6: TF-IDF Logistic Regression**

In [55]:
# establish features and target for tfidf logit model

# create cv_bigrams vectorizer object
tfidf = TfidfVectorizer()

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
logit = LogisticRegression(random_state=56)

# fit model object
logit.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = logit.predict(X_train)
test['predicted'] = logit.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Logistic Regression", model_object=logit, X=X_train, y=y_train)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
5,TF-IDF Logistic Regression,0.639535
1,"TF-IDF Gini, 5",0.587209
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.581395
0,"CV Gini, 5",0.569767
3,"TF-IDF Bigrams Gini, 5",0.447674
2,"CV Bigrams Gini, 5",0.412791


In [56]:
print('TF-IDF Logistic Regression Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Logistic Regression Model Train Accuracy: 63.95%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.925926,0.397959,1.0,...,0.95,0.0,0.0,0.0,0.0,0.0,0.0,0.639535,0.313694,0.591387
recall,0.0,0.0,0.615385,0.0,0.0,0.0,0.6,1.0,1.0,0.6,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.639535,0.269341,0.639535
f1-score,0.0,0.0,0.761905,0.0,0.0,0.0,0.75,0.961538,0.569343,0.75,...,0.974359,0.0,0.0,0.0,0.0,0.0,0.0,0.639535,0.274721,0.572682
support,3.0,2.0,13.0,3.0,7.0,2.0,15.0,25.0,39.0,10.0,...,19.0,2.0,7.0,2.0,2.0,7.0,3.0,0.639535,172.0,172.0


In [57]:
print('TF-IDF Logistic Regression Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Logistic Regression Model Test Accuracy: 36.36%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,1.0,0.212121,0.0,0.666667,0.0,0.0,0.363636,0.170799,0.412534
recall,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0.4,0.0,0.0,0.363636,0.172727,0.363636
f1-score,0.0,0.0,0.0,0.0,0.0,0.666667,0.35,0.0,0.5,0.0,0.0,0.363636,0.137879,0.320833
support,2.0,5.0,3.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,0.363636,44.0,44.0


In [58]:
# go_df = df[df.language == "Go"]
# go_df.to_dict("response")

**Model #7: TF-IDF Hyperparameters**

In [59]:
# establish features and target for tfidf model

# create tfidf vectorizer object
tfidf = TfidfVectorizer()

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y, train_size=.7, test_size=.3)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(criterion="entropy", max_depth=5, min_samples_leaf=2,random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Entropy, 5, 2, Train 70%", model_object=tree, X=X_train, y=y_train)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.642384
5,TF-IDF Logistic Regression,0.639535
1,"TF-IDF Gini, 5",0.587209
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.581395
0,"CV Gini, 5",0.569767
3,"TF-IDF Bigrams Gini, 5",0.447674
2,"CV Bigrams Gini, 5",0.412791


In [60]:
print('TF-IDF Entropy, 5, 2, Train 70% Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Entropy, 5, 2, Train 70% Model Train Accuracy: 64.24%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.333333,0.0,0.0,0.0,0.448276,1.0,0.72973,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.642384,0.3714,0.624134
recall,0.0,0.0,0.769231,0.0,0.0,0.0,1.0,0.782609,0.72973,1.0,...,0.6,0.0,0.0,0.0,0.0,0.5,1.0,0.642384,0.394078,0.642384
f1-score,0.0,0.0,0.465116,0.0,0.0,0.0,0.619048,0.878049,0.72973,1.0,...,0.75,0.0,0.0,0.0,0.0,0.666667,1.0,0.642384,0.359842,0.603623
support,3.0,2.0,13.0,3.0,7.0,1.0,13.0,23.0,37.0,9.0,...,15.0,1.0,7.0,1.0,1.0,4.0,3.0,0.642384,151.0,151.0


In [61]:
print('TF-IDF Entropy, 5, 2, Train 70% Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Entropy, 5, 2, Train 70% Model Test Accuracy: 27.69%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,PHP,PowerShell,Python,R,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.076923,0.857143,0.6,0.0,0.4,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.276923,0.157886,0.431023
recall,0.0,0.0,0.0,0.0,0.0,0.333333,0.5,0.666667,0.0,0.5,0.0,0.214286,0.0,0.0,0.0,0.0,0.0,0.276923,0.130252,0.276923
f1-score,0.0,0.0,0.0,0.0,0.0,0.125,0.631579,0.631579,0.0,0.444444,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.276923,0.127408,0.308963
support,2.0,5.0,3.0,2.0,1.0,3.0,12.0,9.0,1.0,4.0,1.0,14.0,2.0,2.0,1.0,3.0,0.0,0.276923,65.0,65.0


In [62]:
pd.Series(dict(zip(tfidf.get_feature_names(), tree.feature_importances_))).sort_values(ascending=False).head(20)

npm            0.204118
python         0.175850
java           0.124713
please         0.094679
new            0.068430
security       0.066382
br             0.054278
list           0.039029
im             0.036884
readme         0.030074
save           0.028209
disk           0.022629
require        0.020684
reference      0.014600
key            0.014600
doc            0.004841
furukawa       0.000000
fury           0.000000
fupdate        0.000000
furthermore    0.000000
dtype: float64

In [63]:
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.642384
5,TF-IDF Logistic Regression,0.639535
1,"TF-IDF Gini, 5",0.587209
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.581395
0,"CV Gini, 5",0.569767
3,"TF-IDF Bigrams Gini, 5",0.447674
2,"CV Bigrams Gini, 5",0.412791


---
**What words are appearing most frequently depending on language?**

---

### Stratification Detour

In [64]:
# df.shape

In [65]:
# df.language.value_counts() >= 2

In [66]:
# df.groupby('language').filter(lambda x : len(x) >= 2)

In [67]:
# df.groupby("language").language.agg(["count"]).sort_values(by="count") 

In [68]:
# df[df.groupby("language").language.transform("count") >= 2]

In [69]:
# df = df[df.groupby("language").language.transform("count") >= 2]

In [70]:
# df[df.language == "ApacheConf"]

**Model #8: TF-IDF Bigrams Logistic Regression**

In [71]:
# establish features and target for tfidf logit model

# create cv_bigrams vectorizer object
tfidf = TfidfVectorizer(ngram_range=(2, 2))

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
logit = LogisticRegression(random_state=56)

# fit model object
logit.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = logit.predict(X_train)
test['predicted'] = logit.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Bigrams Logistic Regression", model_object=logit, X=X_train, y=y_train)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.642384
5,TF-IDF Logistic Regression,0.639535
7,TF-IDF Bigrams Logistic Regression,0.604651
1,"TF-IDF Gini, 5",0.587209
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.581395
0,"CV Gini, 5",0.569767
3,"TF-IDF Bigrams Gini, 5",0.447674
2,"CV Bigrams Gini, 5",0.412791


In [72]:
print('TF-IDF Bigrams Logistic Regression Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Bigrams Logistic Regression Model Train Accuracy: 60.47%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.364486,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.604651,0.318224,0.600087
recall,0.0,0.0,0.230769,0.0,0.0,0.0,0.8,1.0,1.0,0.2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.604651,0.24011,0.604651
f1-score,0.0,0.0,0.375,0.0,0.0,0.0,0.888889,1.0,0.534247,0.333333,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.604651,0.242937,0.531792
support,3.0,2.0,13.0,3.0,7.0,2.0,15.0,25.0,39.0,10.0,...,19.0,2.0,7.0,2.0,2.0,7.0,3.0,0.604651,172.0,172.0


In [73]:
print('TF-IDF Bigrams Logistic Regression Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Bigrams Logistic Regression Model Test Accuracy: 20.45%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,1.0,0.0,0.0,0.204545,0.106061,0.253788
recall,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.0,0.0,0.204545,0.109091,0.204545
f1-score,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.333333,0.0,0.0,0.204545,0.056277,0.121212
support,2.0,5.0,3.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,0.204545,44.0,44.0


**Model #9: TF-IDF Unigrams & Bigrams Logistics Regression**

In [74]:
# establish features and target for tfidf logit model

# create cv_bigrams vectorizer object
tfidf = TfidfVectorizer(ngram_range=(1, 2))

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
logit = LogisticRegression(random_state=56)

# fit model object
logit.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = logit.predict(X_train)
test['predicted'] = logit.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Unigram & Bigram Logistic Regression", model_object=logit, X=X_train, y=y_train)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.642384
5,TF-IDF Logistic Regression,0.639535
8,TF-IDF Unigram & Bigram Logistic Regression,0.610465
7,TF-IDF Bigrams Logistic Regression,0.604651
1,"TF-IDF Gini, 5",0.587209
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.581395
0,"CV Gini, 5",0.569767
3,"TF-IDF Bigrams Gini, 5",0.447674
2,"CV Bigrams Gini, 5",0.412791


In [75]:
print('TF-IDF Unigram & Bigram Logistic Regression Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Unigram & Bigram Logistic Regression Model Train Accuracy: 61.05%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.367925,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610465,0.318396,0.600867
recall,0.0,0.0,0.307692,0.0,0.0,0.0,0.6,1.0,1.0,0.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610465,0.248956,0.610465
f1-score,0.0,0.0,0.470588,0.0,0.0,0.0,0.75,1.0,0.537931,0.666667,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.610465,0.257623,0.547119
support,3.0,2.0,13.0,3.0,7.0,2.0,15.0,25.0,39.0,10.0,...,19.0,2.0,7.0,2.0,2.0,7.0,3.0,0.610465,172.0,172.0


In [76]:
print('TF-IDF Unigram & Bigram Logistic Regression Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Unigram & Bigram Logistic Regression Model Test Accuracy: 29.55%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,1.0,0.194444,0.0,0.6,0.0,0.0,0.295455,0.163131,0.394571
recall,0.0,0.0,0.0,0.0,0.0,0.3,1.0,0.0,0.3,0.0,0.0,0.295455,0.145455,0.295455
f1-score,0.0,0.0,0.0,0.0,0.0,0.461538,0.325581,0.0,0.4,0.0,0.0,0.295455,0.10792,0.247601
support,2.0,5.0,3.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,0.295455,44.0,44.0


**Model #10: Naive Bayes**

In [77]:
# establish features and target for tfidf logit model

# create cv_bigrams vectorizer object
tfidf = TfidfVectorizer()

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
nb = MultinomialNB()

# fit model object
nb.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = nb.predict(X_train)
test['predicted'] = nb.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Naive Bayes", model_object=nb, X=X_train, y=y_train)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.642384
5,TF-IDF Logistic Regression,0.639535
8,TF-IDF Unigram & Bigram Logistic Regression,0.610465
7,TF-IDF Bigrams Logistic Regression,0.604651
1,"TF-IDF Gini, 5",0.587209
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.581395
0,"CV Gini, 5",0.569767
3,"TF-IDF Bigrams Gini, 5",0.447674
2,"CV Bigrams Gini, 5",0.412791
9,TF-IDF Naive Bayes,0.412791


In [78]:
print('TF-IDF Naive Bayes Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Naive Bayes Model Train Accuracy: 41.28%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.278571,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.412791,0.163929,0.406188
recall,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,0.96,1.0,0.0,...,0.315789,0.0,0.0,0.0,0.0,0.0,0.0,0.412791,0.120456,0.412791
f1-score,0.0,0.0,0.0,0.0,0.0,0.0,0.235294,0.979592,0.435754,0.0,...,0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.412791,0.106532,0.31473
support,3.0,2.0,13.0,3.0,7.0,2.0,15.0,25.0,39.0,10.0,...,19.0,2.0,7.0,2.0,2.0,7.0,3.0,0.412791,172.0,172.0


In [79]:
print('TF-IDF Naive Bayes Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Naive Bayes Model Test Accuracy: 22.73%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,1.0,0.170732,0.0,1.0,0.0,0.0,0.227273,0.197339,0.481707
recall,0.0,0.0,0.0,0.0,0.0,0.2,1.0,0.0,0.1,0.0,0.0,0.227273,0.118182,0.227273
f1-score,0.0,0.0,0.0,0.0,0.0,0.333333,0.291667,0.0,0.181818,0.0,0.0,0.227273,0.073347,0.163481
support,2.0,5.0,3.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,0.227273,44.0,44.0


**Model #11: TF-IDF Bigrams Naive Bayes**

In [80]:
# establish features and target for tfidf logit model

# create cv_bigrams vectorizer object
tfidf = TfidfVectorizer(ngram_range=(2, 2))

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
nb = MultinomialNB()

# fit model object
nb.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = nb.predict(X_train)
test['predicted'] = nb.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Bigrams Naive Bayes", model_object=nb, X=X_train, y=y_train)
evaluation.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_type,accuracy
6,"TF-IDF Entropy, 5, 2, Train 70%",0.642384
5,TF-IDF Logistic Regression,0.639535
8,TF-IDF Unigram & Bigram Logistic Regression,0.610465
7,TF-IDF Bigrams Logistic Regression,0.604651
1,"TF-IDF Gini, 5",0.587209
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.581395
0,"CV Gini, 5",0.569767
10,TF-IDF Bigrams Naive Bayes,0.52907
3,"TF-IDF Bigrams Gini, 5",0.447674
2,"CV Bigrams Gini, 5",0.412791


In [81]:
nb.predict(X_test)

array(['JavaScript', 'JavaScript', 'Python', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'Python'], dtype='<U16')

In [82]:
print('TF-IDF Bigrams Naive Bayes Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Bigrams Naive Bayes Model Train Accuracy: 52.91%
---


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.325,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.52907,0.26625,0.550436
recall,0.0,0.0,0.153846,0.0,0.0,0.0,0.4,1.0,1.0,0.2,...,0.894737,0.0,0.0,0.0,0.0,0.0,0.0,0.52907,0.182429,0.52907
f1-score,0.0,0.0,0.266667,0.0,0.0,0.0,0.571429,1.0,0.490566,0.333333,...,0.944444,0.0,0.0,0.0,0.0,0.0,0.0,0.52907,0.180322,0.450279
support,3.0,2.0,13.0,3.0,7.0,2.0,15.0,25.0,39.0,10.0,...,19.0,2.0,7.0,2.0,2.0,7.0,3.0,0.52907,172.0,172.0


In [83]:
print('TF-IDF Bigrams Naive Bayes Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Bigrams Naive Bayes Model Test Accuracy: 20.45%
---


Unnamed: 0,C,C++,CSS,Go,HTML,Java,JavaScript,PHP,Python,R,Rust,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,1.0,0.0,0.0,0.204545,0.106061,0.253788
recall,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.0,0.0,0.204545,0.109091,0.204545
f1-score,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.333333,0.0,0.0,0.204545,0.056277,0.121212
support,2.0,5.0,3.0,2.0,1.0,10.0,7.0,2.0,10.0,1.0,1.0,0.204545,44.0,44.0


---
# Write a Function

In [84]:
def create_vectorizer_features_and_target():
    """
    This function does the following:
    1. Wrangles README data into a DataFrame
    2. Creates a tfidf vectorizer object
    3. Fits and transforms the clean_readme_contents Series from the df to create features
    4. Establishes a model target
    5. Splits the features (X) and target (y) into train and test
    6. Returns the vectorizer, train features, and train target to be used to fit the model
    """
    

    # wrangle data to train model
    df = pr.wrangle_readme_data()

    # create tfidf vectorizer object
    tfidf = TfidfVectorizer()

    # use tfidf object to create model features
    X = tfidf.fit_transform(df.clean_readme_contents)

    # establish model target
    y = df.language

    # split data
    X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

    return tfidf, X_train, y_train

In [85]:
vectorizer, features, target = create_vectorizer_features_and_target()
print(vectorizer)
features

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)


<172x21354 sparse matrix of type '<class 'numpy.float64'>'
	with 57939 stored elements in Compressed Sparse Row format>

In [86]:
target

42         Python
123        Python
70           HTML
21     JavaScript
99           Java
          ...    
122    TypeScript
162            Go
192          Java
143        Python
85     JavaScript
Name: language, Length: 172, dtype: object

In [87]:
def create_and_fit_model(features, target):
    """
    This function does the following:
    1. Creates model object
    2. Fits model on features and target
    3. Returns model object
    """    

    # create model object
    tree = DecisionTreeClassifier(max_depth=5, random_state=56)

    # fit model object
    tree.fit(features, target)
    
    # return model object to be used later to predict
    return tree

In [88]:
model = create_and_fit_model(features, target)
model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=56, splitter='best')

In [89]:
def predict_language(string: str) -> str:
    """
    This function does the following:
    1. Calls create_vectorizer_features_and_target function to get data to fit model
    and vectorizer to later transform input of string argument.
    2. Calls create_and_fit_model to get model that will predict language of string input
    3. Prepares the input of the string argument, the text of a README file, for modeling by calling the following
    functions:
        a. pr.basic_clean
        b. pr.tokenize
        c. pr.lemmatize
        d. pr.remove_stopwords + additional_stopwords
    4. Creates features (X) out of the string_sans_stopwords variable using the tfidf vectorizer object
    to transform
    5. Predicts the language of the features (X)
    6. Index numpy.ndarray object for string of predicted_language
    7. Returns the language_as_string variable
    """
    
    # call create_vectorizer_features_and_target function to get data to fit model
    tfidf, features, target = create_vectorizer_features_and_target()

    # call fit_model to get model that will predict language of string input
    model = create_and_fit_model(features, target)
    
    # call pr.basic_clean
    string = pr.basic_clean(string)

    # call pr.tokenize
    list_of_tokens = pr.tokenize(string)

    # call pr.lemmatize
    list_of_lemmas = pr.lemmatize(list_of_tokens)

    # additional_stopwords variable
    additional_stopwords = ["img", "1", "yes", "see", "width20", "height20", "okay_icon", "unknown"]

    # call pr.remove_stopwords
    lemmas_sans_stopwords, string_sans_stopwords = pr.remove_stopwords(list_of_lemmas, extra_stopwords=additional_stopwords, exclude_stopwords=[])

    # create X variable for model as vectorized string_sans_stopwords
    X = tfidf.transform([string_sans_stopwords])
    
    # predict language of README
    predicted_language = model.predict(X)    
    
    # index numpy.ndarray object for string of language
    language_as_string = predicted_language[0]

    return language_as_string

In [90]:
language = predict_language("""GitHub Natural Language Processing Project
Purpose
This repository holds all resources used in the attainment of the goals established for the GitHub Natural Language Processing Project.

Goals
Build a model that can predict the programming language of a repository given the text data of the accompanying README file.

Data
Repository data scraped from GitHub.

Data Dictionary
repo: the name of the GitHub repository
language: the primary language the GitHub repository
readme_contents: the original contents of the README file
clean_readme_contents: the cleaned contents of the README file used in analysis and modeling
len_of_clean_readme_contents: length of the clean lemmas in the clean_readme_contents feature
Audience
The audience for this project is the layperson.

Deliverables
Need to Haves:
Model
A well-documented jupyter notebook that contains our analysis
Presentation summarizing our findings
Nice to Haves:
GUI for model
Cloning
All files necessary for cloning and reproducing the work found in the final_project.ipynb file are contained within this repository.""")

In [91]:
language

'HTML'

In [92]:
print(language)

HTML


In [93]:
type(language)

str