# Modeling

In [1]:
import numpy as np
import pandas as pd

from pprint import pprint

import unicodedata

import re

import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

import acquire as ac
import prepare as pr
import preprocessing as pp

---
## Wrangle

In [2]:
# Build the working DataFrame through the project's prepare module and display it.
df = pr.wrangle_readme_data()
df

Unnamed: 0,repo,language,readme_contents,clean_readme_contents
0,rdpeng/ProgrammingAssignment2,R,### Introduction\n\nThis second programming as...,introduction second programming assignment req...
1,octocat/Spoon-Knife,HTML,### Well hello there!\n\nThis repository is me...,well hello repository meant provide example fo...
2,tensorflow/tensorflow,C++,"<div align=""center"">\n <img src=""https://www....",div aligncenter img srchttpswwwtensorfloworgim...
3,SmartThingsCommunity/SmartThingsPublic,Groovy,# SmartThings Public GitHub Repo\n\nAn officia...,smartthings public github repo official list s...
4,twbs/bootstrap,JavaScript,"<p align=""center"">\n <a href=""https://getboot...",p aligncenter hrefhttpsgetbootstrapcom img src...
...,...,...,...,...
220,akveo/ngx-admin,TypeScript,"# ngx-admin [<img src=""https://i.imgur.com/oMc...",ngxadmin img srchttpsiimgurcomomcxwz0png altev...
221,swirldev/swirl_courses,R,# swirl courses\n\nThis is a collection of int...,swirl course collection interactive course use...
222,jrowberg/i2cdevlib,C++,Jennic platform added!\n\n====================...,jennic platform added note detail project plea...
223,etcd-io/etcd,Go,# etcd\n\n[![Go Report Card](https://goreportc...,etcd go report cardhttpsgoreportcardcombadgegi...


In [3]:
# List the column names of the wrangled frame.
df.columns

Index(['repo', 'language', 'readme_contents', 'clean_readme_contents'], dtype='object')

In [4]:
# Rows per repo, most frequent first; a count above 1 means the repo appears more than once.
df.repo.value_counts().head(18)

fengdu78/Coursera-ML-AndrewNg-Notes                                            2
tensorflow/models                                                              2
SmartThingsCommunity/SmartThingsPublic                                         2
arduino/Arduino                                                                2
phonegap/phonegap-start                                                        2
rdpeng/ProgrammingAssignment2                                                  2
crossoverJie/JCSprout                                                          2
soimort/you-get                                                                2
woocommerce/woocommerce                                                        2
nightscout/cgm-remote-monitor                                                  2
octocat/Spoon-Knife                                                            2
axios/axios                                                                    2
apache/incubator-mxnet      

In [5]:
# Least frequent repos (bottom of the same value_counts ordering).
df.repo.value_counts().tail()

webpack/webpack          1
impress/impress.js       1
airbnb/javascript        1
jquery/jquery            1
barryclark/jekyll-now    1
Name: repo, dtype: int64

In [6]:
# Number of distinct repos; dropna=False keeps a missing value counted, as len(unique()) would.
df.repo.nunique(dropna=False)

208

In [7]:
# Stratifying on language with train_test_split will not work unless every
# language has more than one observation, so check the per-language counts.
df.language.value_counts()

JavaScript          47
Java                35
Python              30
C++                 18
HTML                16
Jupyter Notebook    11
Go                   9
PHP                  9
Ruby                 7
TypeScript           7
CSS                  6
C                    5
Vue                  3
Rust                 3
R                    3
Shell                3
C#                   2
Groovy               2
PowerShell           2
Kotlin               2
Objective-C          1
Scala                1
TeX                  1
ApacheConf           1
Swift                1
Name: language, dtype: int64

### Train/Test Split

In [8]:
# Re-create df with the same call used in the Wrangle section, then split it
# into train and test sets with the project's preprocessing helper.
# NOTE(review): df.repo.value_counts() above shows some repos appearing twice;
# confirm split_repo_data cannot place copies of one repo in both train and test.
df = pr.wrangle_readme_data()
train, test = pp.split_repo_data(df)

In [9]:
# Share of all rows that landed in the training split.
print(len(train) / len(df))

0.8


In [10]:
# Share of all rows that landed in the test split.
print(len(test) / len(df))

0.2


---
## Model

In [11]:
# lemmas
list_of_readmes = df.clean_readme_contents.tolist()
list_of_readmes

['introduction second programming assignment require write r function able cache potentially timeconsuming computation example taking mean numeric vector typically fast operation however long vector may take long compute mean especially ha computed repeatedly eg loop content vector changing may make sense cache value mean need looked cache rather recomputed programming assignment take advantage scoping rule r language manipulated preserve state inside r object example caching mean vector example introduce operator used assign value object environment different current environment two function used create special object store numeric vector cache mean first function makevector creates special vector really list containing function 1 set value vector 2 get value vector 3 set value mean 4 get value mean makevector functionx numeric null set functiony x null get function x setmean functionmean mean getmean function listset set get get setmean setmean getmean getmean following function calc

In [12]:
# Tokenize each cleaned README on whitespace.
# The token lists are rebuilt from df instead of overwriting list_of_readmes
# element by element, so re-running this cell no longer fails with
# AttributeError ('list' object has no attribute 'split') on already-split entries.
list_of_readmes = [readme.split() for readme in df.clean_readme_contents.tolist()]

# Preview the first ten tokens of the first three READMEs rather than printing
# every token of every README; the slices are safe even if the list is empty.
print([tokens[:10] for tokens in list_of_readmes[:3]])



In [None]:
# Split every cleaned README string into its whitespace-separated lemmas.
# The original comprehension called .split() on an undefined name (`lemma`
# instead of the loop variable `lemmas`) and iterated over `list_of_lemmas`,
# which is never defined in this notebook; read the strings from df instead.
list_of_list_of_lemmas = [lemmas.split() for lemmas in df.clean_readme_contents.tolist()]
list_of_list_of_lemmas

In [13]:
# Number of tokens in the first README (list_of_readmes holds token lists at this point).
len(list_of_readmes[0])

336

In [14]:
# Number of tokens in the last README.
len(list_of_readmes[-1])

5756

In [15]:
# Token count for every README, in the same order as list_of_readmes.
len_of_readmes = list(map(len, list_of_readmes))
len_of_readmes

[336,
 62,
 440,
 32,
 572,
 148,
 4176,
 336,
 62,
 440,
 32,
 572,
 148,
 4176,
 14,
 50,
 580,
 280,
 176,
 58,
 770,
 327,
 39,
 79,
 46,
 623,
 499,
 261,
 134,
 713,
 641,
 218,
 694,
 208,
 49,
 442,
 153,
 311,
 740,
 131,
 532,
 479,
 934,
 354,
 3,
 445,
 1013,
 260,
 393,
 659,
 730,
 402,
 8204,
 35,
 201,
 248,
 392,
 142,
 732,
 1848,
 1425,
 1864,
 714,
 53,
 291,
 358,
 5342,
 138,
 471,
 301,
 280,
 206,
 454,
 805,
 98,
 56,
 1722,
 719,
 134,
 7299,
 10298,
 294,
 179,
 995,
 299,
 1,
 202,
 1086,
 371,
 14,
 649,
 426,
 95,
 87,
 502,
 92,
 120,
 169,
 73,
 217,
 518,
 390,
 112,
 373,
 321,
 335,
 131,
 1525,
 841,
 5769,
 148,
 216,
 319,
 332,
 954,
 675,
 1150,
 65,
 1,
 1490,
 464,
 149,
 502,
 350,
 10,
 33,
 229,
 8,
 637,
 150,
 719,
 216,
 59,
 91,
 1380,
 282,
 183,
 502,
 553,
 305,
 154,
 569,
 291,
 8,
 239,
 728,
 987,
 400,
 1248,
 439,
 216,
 256,
 1700,
 138,
 30,
 156,
 300,
 190,
 155,
 412,
 346,
 218,
 46,
 967,
 1352,
 1,
 298,
 317,
 277,
 207

In [19]:
# Order the README lengths from shortest to longest (same name, ascending order).
len_of_readmes = sorted(len_of_readmes)

None


In [20]:
# Print the lengths on one line to look for a sensible minimum-length cutoff.
print(len_of_readmes)

[1, 1, 1, 3, 8, 8, 10, 13, 13, 14, 14, 30, 30, 32, 32, 33, 35, 39, 46, 46, 49, 50, 53, 56, 58, 59, 62, 62, 65, 73, 79, 82, 87, 91, 92, 95, 97, 97, 98, 112, 113, 115, 120, 125, 127, 127, 131, 131, 134, 134, 138, 138, 142, 148, 148, 148, 149, 150, 153, 154, 155, 156, 169, 173, 173, 176, 179, 183, 190, 201, 202, 206, 207, 208, 208, 216, 216, 216, 217, 218, 218, 218, 218, 219, 229, 230, 230, 239, 248, 251, 256, 260, 261, 277, 280, 280, 282, 287, 291, 291, 294, 296, 298, 299, 300, 301, 302, 305, 311, 317, 319, 321, 327, 332, 335, 336, 336, 340, 346, 350, 354, 358, 371, 373, 390, 392, 393, 396, 397, 400, 402, 405, 412, 426, 439, 440, 440, 442, 445, 451, 454, 464, 471, 479, 497, 499, 502, 502, 502, 511, 511, 518, 522, 529, 532, 549, 553, 569, 572, 572, 580, 586, 616, 623, 637, 638, 641, 649, 659, 675, 694, 713, 714, 719, 719, 719, 728, 730, 732, 740, 770, 805, 838, 838, 841, 934, 947, 954, 967, 972, 987, 995, 1013, 1072, 1086, 1150, 1187, 1187, 1205, 1248, 1343, 1343, 1352, 1380, 1425, 1490, 

**Cutoff for README length (number of words) = 10**