# Modeling

In [1]:
import numpy as np
import pandas as pd

import unicodedata

import re

import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

import acquire as ac
import prepare as pr
import preprocessing as pp
import evaluate as ev
import model as mo

import warnings
warnings.filterwarnings("ignore")

---
## Wrangle

In [2]:
df = pr.wrangle_readme_data()
df

Unnamed: 0,repo,language,readme_contents,clean_readme_contents,len_of_clean_readme_contents
0,rdpeng/ProgrammingAssignment2,R,### Introduction\n\nThis second programming as...,introduction second programming assignment req...,316
1,octocat/Spoon-Knife,HTML,### Well hello there!\n\nThis repository is me...,well hello repository meant provide example fo...,66
2,tensorflow/tensorflow,C++,"<div align=""center"">\n <img src=""https://www....",div align center tensorflow image tf logo soci...,1138
3,SmartThingsCommunity/SmartThingsPublic,Groovy,# SmartThings Public GitHub Repo\n\nAn officia...,smartthings public repo official list smartapp...,44
4,twbs/bootstrap,JavaScript,"<p align=""center"">\n <a href=""https://getboot...",align center href getbootstrap getbootstrap as...,1048
...,...,...,...,...,...
221,akveo/ngx-admin,TypeScript,"# ngx-admin [<img src=""https://i.imgur.com/oMc...",ngx admin imgur omcxwz png alt eva design syst...,434
222,swirldev/swirl_courses,R,# swirl courses\n\nThis is a collection of int...,swirl course collection interactive course use...,420
223,jrowberg/i2cdevlib,C++,Jennic platform added!\n\n====================...,jennic platform added note detail project plea...,243
224,etcd-io/etcd,Go,# etcd\n\n[![Go Report Card](https://goreportc...,etcd go report card goreportcard badge etcd io...,882


In [3]:
df.columns

Index(['repo', 'language', 'readme_contents', 'clean_readme_contents',
       'len_of_clean_readme_contents'],
      dtype='object')

In [4]:
df.repo.value_counts().head(18)

octocat/Spoon-Knife                       2
tensorflow/tensorflow                     2
twbs/bootstrap                            2
rdpeng/ProgrammingAssignment2             2
SmartThingsCommunity/SmartThingsPublic    2
tensorflow/models                         2
nightscout/cgm-remote-monitor             2
swirldev/swirl_courses                    1
vuejs/vue                                 1
ityouknow/spring-boot-examples            1
WordPress/WordPress                       1
airbnb/javascript                         1
scikit-learn/scikit-learn                 1
adobe/brackets                            1
angular-ui/bootstrap                      1
donnemartin/system-design-primer          1
udacity/fullstack-nanodegree-vm           1
testerSunshine/12306                      1
Name: repo, dtype: int64

In [5]:
df.repo.value_counts().tail()

julycoding/The-Art-Of-Programming-By-July    1
angular/angular-seed                         1
webpack/webpack                              1
CyC2018/CS-Notes                             1
lenve/vhr                                    1
Name: repo, dtype: int64

In [6]:
len(df.repo.unique())

219

In [7]:
# stratification base on language using train_test_split won't work unless we have more than one observation
# per language
df.language.value_counts()

JavaScript          51
Java                36
Python              27
C++                 16
HTML                14
Jupyter Notebook    12
Go                  10
C                   10
TypeScript           9
PHP                  9
Ruby                 7
CSS                  5
Shell                4
Vue                  3
R                    3
Rust                 2
Kotlin               2
C#                   2
Groovy               2
PowerShell           2
Name: language, dtype: int64

---
## Model

---
### Outlier Detour

In [8]:
# lemmas
# list_of_readmes = df.clean_readme_contents.tolist()
# list_of_readmes

In [9]:
# for index in range(len(list_of_readmes)):
#     list_of_readmes[index] = list_of_readmes[index].split()
    

# print(list_of_readmes)

In [10]:
# list_of_list_of_lemmas = [lemma.split() for lemmas in list_of_lemmas]
# list_of_list_of_lemmas

In [11]:
# len(list_of_readmes[0])

In [12]:
# len(list_of_readmes[-1])

In [13]:
# len_of_readmes = [len(readme) for readme in list_of_readmes]
# len_of_readmes

In [14]:
# len_of_readmes.sort()

In [15]:
# print(len_of_readmes)

**Cutoff for length of words in readme = 10**

In [16]:
index_three_list = df[df.index == 3].clean_readme_contents.values.tolist()
index_three_list

['smartthings public repo official list smartapps device type smartthings link help get started coding right away specific documentation smartthings en latest tool ide integration html full documentation smartthings ide simulator ide smartthings community forum community smartthings follow web twitter twitter smartthingsdev facebook facebook smartthingsdevelopers']

In [17]:
df[df.index == 3].len_of_clean_readme_contents

3    44
Name: len_of_clean_readme_contents, dtype: int64

**len_of_clean_readme_contents seems to be working**

In [18]:
df.len_of_clean_readme_contents.value_counts().sort_index()

12       2
13       1
18       1
19       1
22       1
        ..
7612     1
8042     1
10071    1
14505    1
22628    1
Name: len_of_clean_readme_contents, Length: 200, dtype: int64

In [19]:
df.shape

(226, 5)

**225 observations before outlier removal; should have 219 after**

**CHECK**

---
### Back on Model Track

#### Train/Test Split

In [20]:
df = pr.wrangle_readme_data()
print(df.shape)
df.head()

(226, 5)


Unnamed: 0,repo,language,readme_contents,clean_readme_contents,len_of_clean_readme_contents
0,rdpeng/ProgrammingAssignment2,R,### Introduction\n\nThis second programming as...,introduction second programming assignment req...,316
1,octocat/Spoon-Knife,HTML,### Well hello there!\n\nThis repository is me...,well hello repository meant provide example fo...,66
2,tensorflow/tensorflow,C++,"<div align=""center"">\n <img src=""https://www....",div align center tensorflow image tf logo soci...,1138
3,SmartThingsCommunity/SmartThingsPublic,Groovy,# SmartThings Public GitHub Repo\n\nAn officia...,smartthings public repo official list smartapp...,44
4,twbs/bootstrap,JavaScript,"<p align=""center"">\n <a href=""https://getboot...",align center href getbootstrap getbootstrap as...,1048


## Modeling

Transform your documents into a form that can be used in a machine learning model. You should use the programming language of the repository as the label to predict.

Try fitting several different models and using several different representations of the text (e.g. a simple bag of words, then also the TF-IDF values for each).

Build a function that will take in the text of a README file, and tries to predict the programming language.

**CountVectorizer**

In [21]:
# create cv object
cv = CountVectorizer()

In [22]:
# fit and use the cv object
cv_bag_of_words = cv.fit_transform(df.clean_readme_contents)

In [23]:
cv_bag_of_words

<226x21895 sparse matrix of type '<class 'numpy.int64'>'
	with 73098 stored elements in Compressed Sparse Row format>

**TF-IDF**

In [24]:
# create tfidf vectorizer object
tfidf = TfidfVectorizer()

In [25]:
tfidf_bag_of_words = tfidf.fit_transform(df.clean_readme_contents)

In [26]:
tfidf_bag_of_words

<226x21895 sparse matrix of type '<class 'numpy.float64'>'
	with 73098 stored elements in Compressed Sparse Row format>

**CountVectorizer Bag of Bigrams**

In [27]:
# create cv for bigrams
cv_bigrams = CountVectorizer(ngram_range=(2, 2))

In [28]:
cv_bag_of_bigrams = cv_bigrams.fit_transform(df.clean_readme_contents)

In [29]:
cv_bag_of_bigrams

<226x141760 sparse matrix of type '<class 'numpy.int64'>'
	with 168591 stored elements in Compressed Sparse Row format>

**TF-IDF Bag of Bigrams**

In [30]:
tfidf_bigrams = TfidfVectorizer(ngram_range=(2, 2))

In [31]:
tfidf_bag_of_bigrams = tfidf_bigrams.fit_transform(df.clean_readme_contents)

In [32]:
tfidf_bag_of_bigrams

<226x141760 sparse matrix of type '<class 'numpy.float64'>'
	with 168591 stored elements in Compressed Sparse Row format>

___

**Model #1: Standard CV**

In [33]:
# establish features and target for CountVectorizer model
X = cv.fit_transform(df.clean_readme_contents)
y = df.language

In [34]:
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

In [35]:
print(X_train.shape)
print(y_train.shape)
print(X_train.shape[0]/df.shape[0])
print(y_train.shape[0]/df.shape[0])

(180, 21895)
(180,)
0.7964601769911505
0.7964601769911505


In [36]:
print(X_test.shape)
print(y_test.shape)
print(X_test.shape[0]/df.shape[0])
print(y_test.shape[0]/df.shape[0])

(46, 21895)
(46,)
0.20353982300884957
0.20353982300884957


In [37]:
pd.DataFrame(X_train[:5, :].todense(), columns=cv.get_feature_names())

Unnamed: 0,aa,aaaa,aaaaaaaaaac,aaaaaaaaaai,aaaaaaaad,aaaaaaaaecm,aaaaaaaaerc,aaaaaaaaex,aab,aac,...,zxing,zxingobjc,zxingorg,zybpzd,zygmuntz,zynga,zypper,zyszys,zz,zzm
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# create evaluation DataFrame
evaluation = pd.DataFrame(columns=["model_type", "train_accuracy", "test_accuracy"])

In [39]:
# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(max_depth=5, random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="CV Gini, 5", model_object=tree, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
0,"CV Gini, 5",0.588889,0.413043


In [40]:
print('CV DT Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

CV DT Model Train Accuracy: 58.89%
---


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.5,0.0,1.0,0.0,0.833333,0.0,1.0,1.0,0.352381,0.818182,...,0.875,0.0,0.0,0.0,0.0,1.0,1.0,0.588889,0.518945,0.689711
recall,0.125,0.0,0.153846,0.0,0.625,0.0,0.090909,0.785714,1.0,0.75,...,0.736842,0.0,0.0,0.0,0.0,0.666667,1.0,0.588889,0.352949,0.588889
f1-score,0.2,0.0,0.266667,0.0,0.714286,0.0,0.166667,0.88,0.521127,0.782609,...,0.8,0.0,0.0,0.0,0.0,0.8,1.0,0.588889,0.378363,0.548969
support,8.0,2.0,13.0,5.0,8.0,2.0,11.0,28.0,37.0,12.0,...,19.0,2.0,6.0,2.0,1.0,9.0,3.0,0.588889,180.0,180.0


In [41]:
print('CV DT Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

CV DT Model Test Accuracy: 41.30%
---


Unnamed: 0,C,C++,Go,HTML,Java,JavaScript,Jupyter Notebook,PHP,Python,R,Ruby,Shell,TypeScript,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.666667,0.423077,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.413043,0.256903,0.3534
recall,0.0,0.0,1.0,0.0,0.5,0.785714,0.0,1.0,0.125,0.0,0.0,0.0,0.0,0.413043,0.262363,0.413043
f1-score,0.0,0.0,1.0,0.0,0.571429,0.55,0.0,1.0,0.166667,0.0,0.0,0.0,0.0,0.413043,0.25293,0.360973
support,2.0,3.0,2.0,3.0,8.0,14.0,0.0,1.0,8.0,1.0,1.0,3.0,0.0,0.413043,46.0,46.0


**Model #2: TF-IDF**

In [42]:
# establish features and target for tfidf model

# create tfidf vectorizer object
tfidf = TfidfVectorizer()

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(max_depth=5, random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Gini, 5", model_object=tree, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
0,"CV Gini, 5",0.588889,0.413043
1,"TF-IDF Gini, 5",0.566667,0.369565


In [43]:
X

<226x21895 sparse matrix of type '<class 'numpy.float64'>'
	with 73098 stored elements in Compressed Sparse Row format>

In [44]:
print('TF-IDF DT Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF DT Model Train Accuracy: 56.67%
---


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.6,0.0,1.0,0.0,1.0,1.0,0.327434,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.566667,0.396372,0.610639
recall,0.0,0.0,0.230769,0.0,0.625,0.0,0.090909,0.821429,1.0,0.833333,...,0.736842,0.0,0.0,0.0,0.0,0.666667,1.0,0.566667,0.300247,0.566667
f1-score,0.0,0.0,0.333333,0.0,0.769231,0.0,0.166667,0.901961,0.493333,0.909091,...,0.848485,0.0,0.0,0.0,0.0,0.8,1.0,0.566667,0.311105,0.516995
support,8.0,2.0,13.0,5.0,8.0,2.0,11.0,28.0,37.0,12.0,...,19.0,2.0,6.0,2.0,1.0,9.0,3.0,0.566667,180.0,180.0


In [45]:
print('TF-IDF DT Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF DT Model Test Accuracy: 36.96%
---


Unnamed: 0,C,C++,Go,HTML,Java,JavaScript,Jupyter Notebook,PHP,Python,R,Ruby,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.5,0.322581,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.369565,0.201613,0.402525
recall,0.0,0.0,1.0,0.0,0.25,0.714286,0.0,0.0,0.375,0.0,0.0,0.0,0.0,0.0,0.369565,0.167092,0.369565
f1-score,0.0,0.0,1.0,0.0,0.333333,0.444444,0.0,0.0,0.545455,0.0,0.0,0.0,0.0,0.0,0.369565,0.165945,0.331577
support,2.0,3.0,2.0,3.0,8.0,14.0,0.0,1.0,8.0,1.0,1.0,3.0,0.0,0.0,0.369565,46.0,46.0


**Model #3: CV Bigrams**

In [46]:
# establish features and target for cv model

# create cv_bigrams vectorizer object
cv_bigrams = CountVectorizer(ngram_range=(2, 2))

# use tfidf object to create model features
X = cv_bigrams.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(max_depth=5, random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="CV Bigrams Gini, 5", model_object=tree, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
2,"CV Bigrams Gini, 5",0.4,0.478261
0,"CV Gini, 5",0.588889,0.413043
1,"TF-IDF Gini, 5",0.566667,0.369565


In [47]:
print('CV Bigrams Gini, 5 Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

CV Bigrams Gini, 5 Model Train Accuracy: 40.00%
---


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.21374,0.814815,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.4,0.351428,0.556294
recall,0.0,0.0,0.153846,0.0,0.0,0.0,0.0,1.0,0.594595,0.083333,...,0.473684,0.0,0.0,0.0,0.0,0.444444,0.666667,0.4,0.195828,0.4
f1-score,0.0,0.0,0.266667,0.0,0.0,0.0,0.0,0.352201,0.6875,0.153846,...,0.642857,0.0,0.0,0.0,0.0,0.615385,0.8,0.4,0.209256,0.367211
support,8.0,2.0,13.0,5.0,8.0,2.0,11.0,28.0,37.0,12.0,...,19.0,2.0,6.0,2.0,1.0,9.0,3.0,0.4,180.0,180.0


In [48]:
print('CV Bigrams Gini, 5 Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

CV Bigrams Gini, 5 Model Test Accuracy: 47.83%
---


Unnamed: 0,C,C++,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Shell,TypeScript,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.285714,1.0,0.5,0.5,0.0,0.0,0.0,0.0,0.478261,0.190476,0.451863
recall,0.0,0.0,0.0,0.0,1.0,0.785714,1.0,0.25,0.0,0.0,0.0,0.0,0.478261,0.252976,0.478261
f1-score,0.0,0.0,0.0,0.0,0.444444,0.88,0.666667,0.333333,0.0,0.0,0.0,0.0,0.478261,0.193704,0.417585
support,2.0,3.0,2.0,3.0,8.0,14.0,1.0,8.0,1.0,1.0,3.0,0.0,0.478261,46.0,46.0


**Model #4: TF-IDF Bigrams**

In [49]:
# establish features and target for tfidf model

# create cv_bigrams vectorizer object
tfidf_bigrams = TfidfVectorizer(ngram_range=(2, 2))

# use tfidf object to create model features
X = tfidf_bigrams.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(max_depth=5, random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Bigrams Gini, 5", model_object=tree, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
3,"TF-IDF Bigrams Gini, 5",0.405556,0.5
2,"CV Bigrams Gini, 5",0.4,0.478261
0,"CV Gini, 5",0.588889,0.413043
1,"TF-IDF Gini, 5",0.566667,0.369565


In [50]:
print('TF-IDF Bigrams Gini, 5 Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Bigrams Gini, 5 Model Train Accuracy: 40.56%
---


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.217054,0.846154,1.0,...,0.933333,0.0,0.0,0.0,0.0,1.0,1.0,0.405556,0.324827,0.525659
recall,0.0,0.0,0.153846,0.2,0.0,0.0,0.0,1.0,0.594595,0.083333,...,0.736842,0.0,0.0,0.0,0.0,0.333333,0.666667,0.405556,0.188431,0.405556
f1-score,0.0,0.0,0.266667,0.285714,0.0,0.0,0.0,0.356688,0.698413,0.153846,...,0.823529,0.0,0.0,0.0,0.0,0.5,0.8,0.405556,0.194243,0.361761
support,8.0,2.0,13.0,5.0,8.0,2.0,11.0,28.0,37.0,12.0,...,19.0,2.0,6.0,2.0,1.0,9.0,3.0,0.405556,180.0,180.0


In [51]:
print('TF-IDF Bigrams Gini, 5 Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Bigrams Gini, 5 Model Test Accuracy: 50.00%
---


Unnamed: 0,C,C++,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Shell,TypeScript,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.285714,1.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.5,0.162698,0.469979
recall,0.0,0.0,0.0,0.0,1.0,0.785714,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.190476,0.5
f1-score,0.0,0.0,0.0,0.0,0.444444,0.88,0.0,0.571429,0.0,0.0,0.0,0.0,0.5,0.157989,0.4445
support,2.0,3.0,2.0,3.0,8.0,14.0,1.0,8.0,1.0,1.0,3.0,0.0,0.5,46.0,46.0


**Model #5: TF-IDF Unigrams & Bigrams**

In [52]:
# establish features and target for tfidf model

# create cv_bigrams vectorizer object
tfidf_unigrams_and_bigrams = TfidfVectorizer(ngram_range=(1, 2))

# use tfidf object to create model features
X = tfidf_unigrams_and_bigrams.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(max_depth=5, random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Unigrams & Bigrams Gini, 5", model_object=tree, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
3,"TF-IDF Bigrams Gini, 5",0.405556,0.5
2,"CV Bigrams Gini, 5",0.4,0.478261
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.588889,0.434783
0,"CV Gini, 5",0.588889,0.413043
1,"TF-IDF Gini, 5",0.566667,0.369565


In [53]:
print('TF-IDF Unigrams & Bigrams Gini, 5 Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Unigrams & Bigrams Gini, 5 Model Train Accuracy: 58.89%
---


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.342593,0.909091,...,0.875,0.0,0.0,0.0,0.0,1.0,1.0,0.588889,0.506334,0.712278
recall,0.125,0.0,0.153846,0.0,0.625,0.0,0.090909,0.785714,1.0,0.833333,...,0.736842,0.0,0.0,0.0,0.0,0.666667,1.0,0.588889,0.332116,0.588889
f1-score,0.222222,0.0,0.266667,0.0,0.769231,0.0,0.166667,0.88,0.510345,0.869565,...,0.8,0.0,0.0,0.0,0.0,0.8,1.0,0.588889,0.352696,0.548572
support,8.0,2.0,13.0,5.0,8.0,2.0,11.0,28.0,37.0,12.0,...,19.0,2.0,6.0,2.0,1.0,9.0,3.0,0.588889,180.0,180.0


In [54]:
print('TF-IDF Unigrams & Bigrams Gini, 5 Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Unigrams & Bigrams Gini, 5 Model Test Accuracy: 43.48%
---


Unnamed: 0,C,C++,Go,HTML,Java,JavaScript,Jupyter Notebook,PHP,Python,R,Ruby,Shell,TypeScript,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.6,0.448276,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.434783,0.260124,0.363968
recall,0.0,0.0,1.0,0.0,0.375,0.928571,0.0,1.0,0.125,0.0,0.0,0.0,0.0,0.434783,0.263736,0.434783
f1-score,0.0,0.0,1.0,0.0,0.461538,0.604651,0.0,1.0,0.181818,0.0,0.0,0.0,0.0,0.434783,0.249847,0.36113
support,2.0,3.0,2.0,3.0,8.0,14.0,0.0,1.0,8.0,1.0,1.0,3.0,0.0,0.434783,46.0,46.0


---

**Create a function to append evaluation metrics (accuracy) to evaluation dataframe**


**CHECK**

---

**Model #6: TF-IDF Logistic Regression**

In [55]:
# establish features and target for tfidf logit model

# create cv_bigrams vectorizer object
tfidf = TfidfVectorizer()

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
logit = LogisticRegression(random_state=56)

# fit model object
logit.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = logit.predict(X_train)
test['predicted'] = logit.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Logistic Regression", model_object=logit, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
3,"TF-IDF Bigrams Gini, 5",0.405556,0.5
2,"CV Bigrams Gini, 5",0.4,0.478261
5,TF-IDF Logistic Regression,0.605556,0.456522
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.588889,0.434783
0,"CV Gini, 5",0.588889,0.413043
1,"TF-IDF Gini, 5",0.566667,0.369565


In [56]:
print('TF-IDF Logistic Regression Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Logistic Regression Model Train Accuracy: 60.56%
---


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.965517,0.345794,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.605556,0.315566,0.576827
recall,0.0,0.0,0.692308,0.0,0.0,0.0,0.272727,1.0,1.0,0.916667,...,1.0,0.0,0.0,0.0,0.0,0.222222,0.0,0.605556,0.255196,0.605556
f1-score,0.0,0.0,0.818182,0.0,0.0,0.0,0.428571,0.982456,0.513889,0.956522,...,1.0,0.0,0.0,0.0,0.0,0.363636,0.0,0.605556,0.253163,0.531246
support,8.0,2.0,13.0,5.0,8.0,2.0,11.0,28.0,37.0,12.0,...,19.0,2.0,6.0,2.0,1.0,9.0,3.0,0.605556,180.0,180.0


In [57]:
print('TF-IDF Logistic Regression Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Logistic Regression Model Test Accuracy: 45.65%
---


Unnamed: 0,C,C++,Go,HTML,Java,JavaScript,Jupyter Notebook,PHP,Python,R,Ruby,Shell,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.666667,0.411765,0.0,0.0,0.6,0.0,0.0,0.0,0.456522,0.139869,0.34561
recall,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0.0,0.375,0.0,0.0,0.0,0.456522,0.15625,0.456522
f1-score,0.0,0.0,0.0,0.0,0.571429,0.583333,0.0,0.0,0.461538,0.0,0.0,0.0,0.456522,0.134692,0.357183
support,2.0,3.0,2.0,3.0,8.0,14.0,0.0,1.0,8.0,1.0,1.0,3.0,0.456522,46.0,46.0


In [58]:
# go_df = df[df.language == "Go"]
# go_df.to_dict("response")

**Model #7: TF-IDF Hyperparameters**

In [59]:
# establish features and target for tfidf model

# create tfidf vectorizer object
tfidf = TfidfVectorizer()

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y, train_size=.7, test_size=.3)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
tree = DecisionTreeClassifier(criterion="entropy", max_depth=5, min_samples_leaf=2,random_state=56)

# fit model object
tree.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Entropy, 5, 2, Train 70%", model_object=tree, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
3,"TF-IDF Bigrams Gini, 5",0.405556,0.5
2,"CV Bigrams Gini, 5",0.4,0.478261
5,TF-IDF Logistic Regression,0.605556,0.456522
6,"TF-IDF Entropy, 5, 2, Train 70%",0.594937,0.455882
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.588889,0.434783
0,"CV Gini, 5",0.588889,0.413043
1,"TF-IDF Gini, 5",0.566667,0.369565


In [60]:
print('TF-IDF Entropy, 5, 2, Train 70% Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Entropy, 5, 2, Train 70% Model Train Accuracy: 59.49%
---


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,PowerShell,Python,R,Ruby,Rust,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.333333,0.0,1.0,0.0,0.0,0.0,0.191489,1.0,0.875,1.0,...,0.0,0.833333,0.0,0.0,0.0,1.0,1.0,0.594937,0.400429,0.686961
recall,0.625,0.0,0.230769,0.0,0.0,0.0,0.9,0.72,0.65625,0.833333,...,0.0,0.9375,0.0,0.0,0.0,0.571429,1.0,0.594937,0.385864,0.594937
f1-score,0.434783,0.0,0.375,0.0,0.0,0.0,0.315789,0.837209,0.75,0.909091,...,0.0,0.882353,0.0,0.0,0.0,0.727273,1.0,0.594937,0.355434,0.589944
support,8.0,2.0,13.0,4.0,7.0,1.0,10.0,25.0,32.0,12.0,...,2.0,16.0,1.0,5.0,1.0,7.0,3.0,0.594937,158.0,158.0


In [61]:
print('TF-IDF Entropy, 5, 2, Train 70% Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Entropy, 5, 2, Train 70% Model Test Accuracy: 45.59%
---


Unnamed: 0,C,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,PHP,Python,R,Ruby,Rust,Shell,TypeScript,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.166667,0.75,0.933333,0.0,0.25,0.666667,0.0,0.0,0.0,0.0,0.25,0.455882,0.188542,0.514461
recall,0.0,0.0,0.0,0.0,0.0,0.75,0.545455,0.736842,0.0,0.5,0.545455,0.0,0.0,0.0,0.0,0.5,0.455882,0.223609,0.455882
f1-score,0.0,0.0,0.0,0.0,0.0,0.272727,0.631579,0.823529,0.0,0.333333,0.6,0.0,0.0,0.0,0.0,0.333333,0.455882,0.187156,0.46498
support,2.0,3.0,1.0,3.0,1.0,4.0,11.0,19.0,0.0,2.0,11.0,2.0,2.0,1.0,4.0,2.0,0.455882,68.0,68.0


In [62]:
pd.Series(dict(zip(tfidf.get_feature_names(), tree.feature_importances_))).sort_values(ascending=False).head(20)

python           0.202186
npm              0.152189
java             0.128523
latest           0.108670
security         0.105462
build            0.050528
network          0.046439
preview          0.040917
google           0.036421
cpp              0.030761
article          0.029698
toolchain        0.022056
api              0.016150
open             0.015770
transport        0.014230
funnlp           0.000000
funk             0.000000
function         0.000000
functional       0.000000
functionality    0.000000
dtype: float64

In [63]:
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
3,"TF-IDF Bigrams Gini, 5",0.405556,0.5
2,"CV Bigrams Gini, 5",0.4,0.478261
5,TF-IDF Logistic Regression,0.605556,0.456522
6,"TF-IDF Entropy, 5, 2, Train 70%",0.594937,0.455882
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.588889,0.434783
0,"CV Gini, 5",0.588889,0.413043
1,"TF-IDF Gini, 5",0.566667,0.369565


---
**What words are appearing most frequently depending on language?**

---

### Stratification Detour

In [64]:
# df.shape

In [65]:
# df.language.value_counts() >= 2

In [66]:
# df.groupby('language').filter(lambda x : len(x) >= 2)

In [67]:
# df.groupby("language").language.agg(["count"]).sort_values(by="count") 

In [68]:
# df[df.groupby("language").language.transform("count") >= 2]

In [69]:
# df = df[df.groupby("language").language.transform("count") >= 2]

In [70]:
# df[df.language == "ApacheConf"]

**Model #8: TF-IDF Bigrams Logistic Regression**

In [71]:
# establish features and target for tfidf logit model

# create cv_bigrams vectorizer object
tfidf = TfidfVectorizer(ngram_range=(2, 2))

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
logit = LogisticRegression(random_state=56)

# fit model object
logit.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = logit.predict(X_train)
test['predicted'] = logit.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Bigrams Logistic Regression", model_object=logit, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
3,"TF-IDF Bigrams Gini, 5",0.405556,0.5
2,"CV Bigrams Gini, 5",0.4,0.478261
5,TF-IDF Logistic Regression,0.605556,0.456522
6,"TF-IDF Entropy, 5, 2, Train 70%",0.594937,0.455882
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.588889,0.434783
0,"CV Gini, 5",0.588889,0.413043
1,"TF-IDF Gini, 5",0.566667,0.369565
7,TF-IDF Bigrams Logistic Regression,0.527778,0.326087


In [72]:
print('TF-IDF Bigrams Logistic Regression Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Bigrams Logistic Regression Model Train Accuracy: 52.78%
---


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.303279,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527778,0.265164,0.523452
recall,0.0,0.0,0.384615,0.0,0.0,0.0,0.181818,1.0,1.0,0.333333,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527778,0.194988,0.527778
f1-score,0.0,0.0,0.555556,0.0,0.0,0.0,0.307692,1.0,0.465409,0.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527778,0.191433,0.449039
support,8.0,2.0,13.0,5.0,8.0,2.0,11.0,28.0,37.0,12.0,...,19.0,2.0,6.0,2.0,1.0,9.0,3.0,0.527778,180.0,180.0


In [73]:
print('TF-IDF Bigrams Logistic Regression Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Bigrams Logistic Regression Model Test Accuracy: 32.61%
---


Unnamed: 0,C,C++,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Shell,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.311111,0.0,1.0,0.0,0.0,0.0,0.326087,0.119192,0.268599
recall,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.125,0.0,0.0,0.0,0.326087,0.102273,0.326087
f1-score,0.0,0.0,0.0,0.0,0.0,0.474576,0.0,0.222222,0.0,0.0,0.0,0.326087,0.063345,0.183084
support,2.0,3.0,2.0,3.0,8.0,14.0,1.0,8.0,1.0,1.0,3.0,0.326087,46.0,46.0


**Model #9: TF-IDF Unigrams & Bigrams Logistics Regression**

In [74]:
# establish features and target for tfidf logit model

# create cv_bigrams vectorizer object
tfidf = TfidfVectorizer(ngram_range=(1, 2))

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
logit = LogisticRegression(random_state=56)

# fit model object
logit.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = logit.predict(X_train)
test['predicted'] = logit.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Unigram & Bigram Logistic Regression", model_object=logit, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
3,"TF-IDF Bigrams Gini, 5",0.405556,0.5
2,"CV Bigrams Gini, 5",0.4,0.478261
5,TF-IDF Logistic Regression,0.605556,0.456522
6,"TF-IDF Entropy, 5, 2, Train 70%",0.594937,0.455882
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.588889,0.434783
0,"CV Gini, 5",0.588889,0.413043
8,TF-IDF Unigram & Bigram Logistic Regression,0.6,0.413043
1,"TF-IDF Gini, 5",0.566667,0.369565
7,TF-IDF Bigrams Logistic Regression,0.527778,0.326087


In [75]:
print('TF-IDF Unigram & Bigram Logistic Regression Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Unigram & Bigram Logistic Regression Model Train Accuracy: 60.00%
---


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.33945,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.6,0.316972,0.580887
recall,0.0,0.0,0.692308,0.0,0.0,0.0,0.272727,1.0,1.0,0.916667,...,1.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.6,0.249641,0.6
f1-score,0.0,0.0,0.818182,0.0,0.0,0.0,0.428571,1.0,0.506849,0.956522,...,1.0,0.0,0.0,0.0,0.0,0.2,0.0,0.6,0.245506,0.524346
support,8.0,2.0,13.0,5.0,8.0,2.0,11.0,28.0,37.0,12.0,...,19.0,2.0,6.0,2.0,1.0,9.0,3.0,0.6,180.0,180.0


In [76]:
print('TF-IDF Unigram & Bigram Logistic Regression Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Unigram & Bigram Logistic Regression Model Test Accuracy: 41.30%
---


Unnamed: 0,C,C++,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Shell,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.75,0.368421,0.0,0.5,0.0,0.0,0.0,0.413043,0.147129,0.329519
recall,0.0,0.0,0.0,0.0,0.375,1.0,0.0,0.25,0.0,0.0,0.0,0.413043,0.147727,0.413043
f1-score,0.0,0.0,0.0,0.0,0.5,0.538462,0.0,0.333333,0.0,0.0,0.0,0.413043,0.124709,0.308807
support,2.0,3.0,2.0,3.0,8.0,14.0,1.0,8.0,1.0,1.0,3.0,0.413043,46.0,46.0


**Model #10: Naive Bayes**

In [77]:
# establish features and target for tfidf logit model

# create cv_bigrams vectorizer object
tfidf = TfidfVectorizer()

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
nb = MultinomialNB()

# fit model object
nb.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = nb.predict(X_train)
test['predicted'] = nb.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Naive Bayes", model_object=nb, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
3,"TF-IDF Bigrams Gini, 5",0.405556,0.5
2,"CV Bigrams Gini, 5",0.4,0.478261
5,TF-IDF Logistic Regression,0.605556,0.456522
6,"TF-IDF Entropy, 5, 2, Train 70%",0.594937,0.455882
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.588889,0.434783
0,"CV Gini, 5",0.588889,0.413043
8,TF-IDF Unigram & Bigram Logistic Regression,0.6,0.413043
9,TF-IDF Naive Bayes,0.45,0.413043
1,"TF-IDF Gini, 5",0.566667,0.369565
7,TF-IDF Bigrams Logistic Regression,0.527778,0.326087


In [78]:
print('TF-IDF Naive Bayes Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Naive Bayes Model Train Accuracy: 45.00%
---


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.272059,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.45,0.213603,0.444812
recall,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,1.0,1.0,0.166667,...,0.631579,0.0,0.0,0.0,0.0,0.0,0.0,0.45,0.149003,0.45
f1-score,0.0,0.0,0.0,0.0,0.0,0.0,0.307692,1.0,0.427746,0.285714,...,0.774194,0.0,0.0,0.0,0.0,0.0,0.0,0.45,0.139767,0.363053
support,8.0,2.0,13.0,5.0,8.0,2.0,11.0,28.0,37.0,12.0,...,19.0,2.0,6.0,2.0,1.0,9.0,3.0,0.45,180.0,180.0


In [79]:
print('TF-IDF Naive Bayes Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Naive Bayes Model Test Accuracy: 41.30%
---


Unnamed: 0,C,C++,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Shell,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.8,0.35,0.0,1.0,0.0,0.0,0.0,0.413043,0.195455,0.419565
recall,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0.125,0.0,0.0,0.0,0.413043,0.147727,0.413043
f1-score,0.0,0.0,0.0,0.0,0.615385,0.518519,0.0,0.222222,0.0,0.0,0.0,0.413043,0.123284,0.303481
support,2.0,3.0,2.0,3.0,8.0,14.0,1.0,8.0,1.0,1.0,3.0,0.413043,46.0,46.0


**Model #11: TF-IDF Bigrams Naive Bayes**

In [80]:
# establish features and target for tfidf logit model

# create cv_bigrams vectorizer object
tfidf = TfidfVectorizer(ngram_range=(2, 2))

# use tfidf object to create model features
X = tfidf.fit_transform(df.clean_readme_contents)

# establish model target
y = df.language

# split data
X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

# create train and test DataFrames with actual language variable
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# create model object
nb = MultinomialNB()

# fit model object
nb.fit(X_train, y_train)

# create predicted variable in train and test DataFrames
train['predicted'] = nb.predict(X_train)
test['predicted'] = nb.predict(X_test)

# append evaluation
evaluation = ev.append_evaluation(evaluation, model_type="TF-IDF Bigrams Naive Bayes", model_object=nb, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
evaluation.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,model_type,train_accuracy,test_accuracy
3,"TF-IDF Bigrams Gini, 5",0.405556,0.5
2,"CV Bigrams Gini, 5",0.4,0.478261
5,TF-IDF Logistic Regression,0.605556,0.456522
6,"TF-IDF Entropy, 5, 2, Train 70%",0.594937,0.455882
4,"TF-IDF Unigrams & Bigrams Gini, 5",0.588889,0.434783
0,"CV Gini, 5",0.588889,0.413043
8,TF-IDF Unigram & Bigram Logistic Regression,0.6,0.413043
9,TF-IDF Naive Bayes,0.45,0.413043
1,"TF-IDF Gini, 5",0.566667,0.369565
7,TF-IDF Bigrams Logistic Regression,0.527778,0.326087


In [81]:
nb.predict(X_test)

array(['JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'Python', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript', 'JavaScript', 'JavaScript', 'JavaScript',
       'JavaScript'], dtype='<U16')

In [82]:
print('TF-IDF Bigrams Naive Bayes Model Train Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
pd.DataFrame(classification_report(train.actual, train.predicted, output_dict=True))

TF-IDF Bigrams Naive Bayes Model Train Accuracy: 47.78%
---


Unnamed: 0,C,C#,C++,CSS,Go,Groovy,HTML,Java,JavaScript,Jupyter Notebook,...,Python,R,Ruby,Rust,Shell,TypeScript,Vue,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.282443,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.477778,0.164122,0.38028
recall,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.477778,0.159091,0.477778
f1-score,0.0,0.0,0.0,0.0,0.0,0.0,0.307692,1.0,0.440476,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.477778,0.137408,0.370457
support,8.0,2.0,13.0,5.0,8.0,2.0,11.0,28.0,37.0,12.0,...,19.0,2.0,6.0,2.0,1.0,9.0,3.0,0.477778,180.0,180.0


In [83]:
print('TF-IDF Bigrams Naive Bayes Model Test Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
pd.DataFrame(classification_report(test.actual, test.predicted, output_dict=True))

TF-IDF Bigrams Naive Bayes Model Test Accuracy: 32.61%
---


Unnamed: 0,C,C++,Go,HTML,Java,JavaScript,PHP,Python,R,Ruby,Shell,accuracy,macro avg,weighted avg
precision,0.0,0.0,0.0,0.0,0.0,0.311111,0.0,1.0,0.0,0.0,0.0,0.326087,0.119192,0.268599
recall,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.125,0.0,0.0,0.0,0.326087,0.102273,0.326087
f1-score,0.0,0.0,0.0,0.0,0.0,0.474576,0.0,0.222222,0.0,0.0,0.0,0.326087,0.063345,0.183084
support,2.0,3.0,2.0,3.0,8.0,14.0,1.0,8.0,1.0,1.0,3.0,0.326087,46.0,46.0


---
# Write a Function

In [84]:
def create_vectorizer_features_and_target():
    """
    This function does the following:
    1. Wrangles README data into a DataFrame
    2. Creates a tfidf vectorizer object
    3. Fits and transforms the clean_readme_contents Series from the df to create features
    4. Establishes a model target
    5. Splits the features (X) and target (y) into train and test
    6. Returns the vectorizer, train features, and train target to be used to fit the model
    """
    

    # wrangle data to train model
    df = pr.wrangle_readme_data()

    # create tfidf vectorizer object
    tfidf = TfidfVectorizer()

    # use tfidf object to create model features
    X = tfidf.fit_transform(df.clean_readme_contents)

    # establish model target
    y = df.language

    # split data
    X_train, X_test, y_train, y_test = pp.split_repo_data(X, y)

    return tfidf, X_train, y_train

In [85]:
vectorizer, features, target = create_vectorizer_features_and_target()
print(vectorizer)
features

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)


<180x21895 sparse matrix of type '<class 'numpy.float64'>'
	with 57821 stored elements in Compressed Sparse Row format>

In [86]:
target

7                     R
128          TypeScript
109                Ruby
40                 Rust
224                  Go
             ...       
122                HTML
162    Jupyter Notebook
192                   C
143    Jupyter Notebook
85               Python
Name: language, Length: 180, dtype: object

In [87]:
def create_and_fit_model(features, target):
    """
    This function does the following:
    1. Creates model object
    2. Fits model on features and target
    3. Returns model object
    """    

    # create model object
    tree = DecisionTreeClassifier(max_depth=5, random_state=56)

    # fit model object
    tree.fit(features, target)
    
    # return model object to be used later to predict
    return tree

In [88]:
model = create_and_fit_model(features, target)
model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=56, splitter='best')

In [89]:
def predict_language(string: str) -> str:
    """
    This function does the following:
    1. Calls create_vectorizer_features_and_target function to get data to fit model
    and vectorizer to later transform input of string argument.
    2. Calls create_and_fit_model to get model that will predict language of string input
    3. Prepares the input of the string argument, the text of a README file, for modeling by calling the following
    functions:
        a. pr.basic_clean
        b. pr.tokenize
        c. pr.lemmatize
        d. pr.remove_stopwords + additional_stopwords
    4. Creates features (X) out of the string_sans_stopwords variable using the tfidf vectorizer object
    to transform
    5. Predicts the language of the features (X)
    6. Index numpy.ndarray object for string of predicted_language
    7. Returns the language_as_string variable
    """
    
    # call create_vectorizer_features_and_target function to get data to fit model
    tfidf, features, target = create_vectorizer_features_and_target()

    # call fit_model to get model that will predict language of string input
    model = create_and_fit_model(features, target)
    
    # call pr.basic_clean
    string = pr.basic_clean(string)

    # call pr.tokenize
    list_of_tokens = pr.tokenize(string)

    # call pr.lemmatize
    list_of_lemmas = pr.lemmatize(list_of_tokens)

    # additional_stopwords variable
    additional_stopwords = ["img", "1", "yes", "see", "width20", "height20", "okay_icon", "unknown"]

    # call pr.remove_stopwords
    lemmas_sans_stopwords, string_sans_stopwords = pr.remove_stopwords(list_of_lemmas, extra_stopwords=additional_stopwords, exclude_stopwords=[])

    # create X variable for model as vectorized string_sans_stopwords
    X = tfidf.transform([string_sans_stopwords])
    
    # predict language of README
    predicted_language = model.predict(X)    
    
    # index numpy.ndarray object for string of language
    language_as_string = predicted_language[0]

    return language_as_string

In [90]:
language = mo.predict_language("""GitHub Natural Language Processing Project
Purpose
This repository holds all resources used in the attainment of the goals established for the GitHub Natural Language Processing Project.

Goals
Build a model that can predict the programming language of a repository given the text data of the accompanying README file.

Data
Repository data scraped from GitHub.

Data Dictionary
repo: the name of the GitHub repository
language: the primary language the GitHub repository
readme_contents: the original contents of the README file
clean_readme_contents: the cleaned contents of the README file used in analysis and modeling
len_of_clean_readme_contents: length of the clean lemmas in the clean_readme_contents feature
Audience
The audience for this project is the layperson.

Deliverables
Need to Haves:
Model
A well-documented jupyter notebook that contains our analysis
Presentation summarizing our findings
Nice to Haves:
GUI for model
Cloning
All files necessary for cloning and reproducing the work found in the final_project.ipynb file are contained within this repository.""")

In [91]:
language

'JavaScript'

In [92]:
print(language)

JavaScript


In [93]:
type(language)

str

In [94]:
len("Pneumonoultramicroscopicsilicovolcanoconiosis")

45