In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk import sent_tokenize
from prepare import prepare

In [2]:
# The scraped READMEs contain non-ASCII text, so decode the file explicitly as
# UTF-8 instead of relying on the platform's default encoding.
with open('data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [3]:
df = pd.DataFrame(data)

# See the `prepare` module for details of the preparation steps

In [4]:
train, validate, test = prepare(df)

In [5]:
# .shape reports (rows, columns) for each split; .size would report rows * columns,
# which is easy to misread as the number of observations.
train.shape, validate.shape, test.shape

((58, 6), (25, 6), (21, 6))

In [6]:
train.head()

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
74,Ryujinx/Ryujinx,C#,"\n<h1>\n <img src=""https://i.imgur.com/G6Ml...",h1 img srchttpsiimgurcomg6mlecopng ryujinx hre...,h1 img srchttpsiimgurcomg6mlecopng ryujinx hre...,h1 img srchttpsiimgurcomg6mlecopng ryujinx hre...
71,egametang/ET,C#,# [中文](https://github.com/egametang/Egametang/...,httpsgithubcomegametangegametangblobmasterread...,httpsgithubcomegametangegametangblobmasterread...,httpsgithubcomegametangegametangblobmasterread...
89,bombomby/optick,C#,# [Optick: C++ Profiler For Games](https://opt...,optick c profiler gameshttpsoptickdev githubht...,optick c profil gameshttpsoptickdev githubhttp...,optick c profiler gameshttpsoptickdev githubht...
1,cl2333/Grokking-the-Coding-Interview-Patterns-...,Python,# [Grokking-the-Coding-Interview-Patterns-for-...,grokkingthecodinginterviewpatternsforcodingque...,grokkingthecodinginterviewpatternsforcodingque...,grokkingthecodinginterviewpatternsforcodingque...
100,gustavoguanabara/html-css,HTML,"<img src=""imagens/mascote.png"" align=""right"" w...",img srcimagensmascotepng alignright width300 c...,img srcimagensmascotepng alignright width300 c...,img srcimagensmascotepng alignright width300 c...


# No duplicates

In [7]:
df[df.duplicated()]

Unnamed: 0,repo,language,readme_contents


# Exploration

In [8]:
def clean(text, extra_stopwords=()):
    """Normalize raw text and return its lemmatized, stopword-free tokens.

    The text is NFKD-normalized, stripped of every non-ASCII character,
    lowercased, stripped of every character that is neither a word character
    nor whitespace, and split on whitespace. Tokens found in the stopword set
    are discarded; the remaining tokens are lemmatized.

    Parameters
    ----------
    text : str
        The raw text to process.
    extra_stopwords : iterable of str, optional
        Additional words to discard on top of NLTK's English stopword list.
        Defaults to none.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # A set gives constant-time membership checks; the token list can be very
    # long because callers pass whole corpora joined into one string.
    stopword_set = set(nltk.corpus.stopwords.words('english')) | set(extra_stopwords)
    ascii_text = (unicodedata.normalize('NFKD', text)
                  .encode('ascii', 'ignore')
                  .decode('utf-8', 'ignore')
                  .lower())
    # Punctuation is deleted rather than replaced by a space, so pieces joined
    # only by punctuation (e.g. the parts of a URL) fuse into a single token.
    tokens = re.sub(r'[^\w\s]', '', ascii_text).split()
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stopword_set]

In [9]:
def show_counts_and_ratios(df, column):
    """
    Summarize how often each value of one column occurs.

    Takes a dataframe and the name of a single column.
    Returns a dataframe indexed by the column's values with two columns:
    'n' (absolute count) and 'percent' (share of all rows, as a fraction).
    """
    counts = df[column].value_counts()
    shares = df[column].value_counts(normalize=True)
    summary = pd.concat([counts, shares], axis=1)
    summary.columns = ['n', 'percent']
    return summary

show_counts_and_ratios(df, "language")

Unnamed: 0,n,percent
HTML,29,0.243697
C#,27,0.226891
Python,25,0.210084
JavaScript,23,0.193277
Java,15,0.12605


In [10]:
def _language_words(language):
    """Return the cleaned token list for all READMEs in `df` labelled `language`.

    The matching `readme_contents` strings are joined with single spaces and
    passed through `clean`. A language with no rows yields an empty list.
    """
    readmes = df.loc[df.language == language, 'readme_contents']
    return clean(' '.join(readmes))


python_words = _language_words('Python')
c_sharp_words = _language_words('C#')
html_words = _language_words('HTML')
javascript_words = _language_words('JavaScript')
java_words = _language_words('Java')
# NOTE(review): the language counts shown earlier list no Ruby or Shell rows in
# `df`, so these two lists presumably come back empty — confirm.
ruby_words = _language_words('Ruby')
shell_words = _language_words('Shell')

### Word frequencies

In [17]:
def _word_counts(words):
    """Return a Series mapping each distinct word to its number of occurrences.

    `words` is a list of tokens. The result comes from `Series.value_counts()`,
    so it is ordered from most to least frequent.
    """
    # The dtype is given explicitly so that an empty word list does not trigger
    # pandas' warning about the default dtype of an empty Series.
    return pd.Series(words, dtype=object).value_counts()


python_freq = _word_counts(python_words)
c_sharp_freq = _word_counts(c_sharp_words)
html_freq = _word_counts(html_words)
javascript_freq = _word_counts(javascript_words)
java_freq = _word_counts(java_words)
ruby_freq = _word_counts(ruby_words)
shell_freq = _word_counts(shell_words)


In [24]:
python_freq.head()

yes        1639
unknown     926
apikey      573
video       489
python      447
dtype: int64

In [25]:
c_sharp_freq.head()

new       192
c         188
use       186
var       185
csharp    156
dtype: int64

In [26]:
html_freq.head()

html        97
file        76
use         76
15001700    62
dom         51
dtype: int64

In [27]:
javascript_freq.head()

javascript    309
function      276
const         261
1             240
bad           205
dtype: int64

In [28]:
java_freq.head()

altbr           189
aligncentera    189
width100px      189
td              189
java            177
dtype: int64

### Ruby and Shell each have only one observation, so we can either drop them or collect more data

In [29]:
ruby_freq.head()

Series([], dtype: int64)

In [30]:
shell_freq.head()

Series([], dtype: int64)

# Explore on Train 

In [31]:
train.head()

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
74,Ryujinx/Ryujinx,C#,"\n<h1>\n <img src=""https://i.imgur.com/G6Ml...",h1 img srchttpsiimgurcomg6mlecopng ryujinx hre...,h1 img srchttpsiimgurcomg6mlecopng ryujinx hre...,h1 img srchttpsiimgurcomg6mlecopng ryujinx hre...
71,egametang/ET,C#,# [中文](https://github.com/egametang/Egametang/...,httpsgithubcomegametangegametangblobmasterread...,httpsgithubcomegametangegametangblobmasterread...,httpsgithubcomegametangegametangblobmasterread...
89,bombomby/optick,C#,# [Optick: C++ Profiler For Games](https://opt...,optick c profiler gameshttpsoptickdev githubht...,optick c profil gameshttpsoptickdev githubhttp...,optick c profiler gameshttpsoptickdev githubht...
1,cl2333/Grokking-the-Coding-Interview-Patterns-...,Python,# [Grokking-the-Coding-Interview-Patterns-for-...,grokkingthecodinginterviewpatternsforcodingque...,grokkingthecodinginterviewpatternsforcodingque...,grokkingthecodinginterviewpatternsforcodingque...
100,gustavoguanabara/html-css,HTML,"<img src=""imagens/mascote.png"" align=""right"" w...",img srcimagensmascotepng alignright width300 c...,img srcimagensmascotepng alignright width300 c...,img srcimagensmascotepng alignright width300 c...


In [32]:
# Label-based lookup: the row whose index label is 0, not the first row of train.
train.loc[0, 'stemmed']

"nostalgiaforinfin trade strategi freqtradehttpswwwfreqtradeio crypto bot clone repositori plan clone repositori use strategi regular git clone howev plan run addit strategi run test suit need clone repositori ' submodul newer version git bash git clone recursesubmodul httpsgithubcomiterativvnostalgiaforinfinitygit checkoutpath older version git bash git clone recurs httpsgithubcomiterativvnostalgiaforinfinitygit checkoutpath exist checkout git submodul updat remot checkout chang strategi add strategi userdatastrategiesuserdatastrategi folder also dockercomposeymldockercomposeyml file strategylist add strategi list gener recommend optim perform suggest use 4 6 open trade unlimit stake pairlist 40 80 pair volum pairlist work well prefer stabl coin usdt busd etc pair instead btc eth pair highli recommend blacklist leverag token bull bear etc ensur ' overrid variabl configjson especi timefram must 5m usesellsign must set true set sellprofitonli must set fals set ignoreroiifbuysign must se

In [33]:
# Label-based lookup: the row whose index label is 0, not the first row of train.
train.loc[0, 'clean']

"nostalgiaforinfinity trading strategy freqtradehttpswwwfreqtradeio crypto bot clone repository plan clone repository use strategy regular git clone however plan running additional strategies run test suite need clone repository ' submodules newer versions git bash git clone recursesubmodules httpsgithubcomiterativvnostalgiaforinfinitygit checkoutpath older versions git bash git clone recursive httpsgithubcomiterativvnostalgiaforinfinitygit checkoutpath existing checkouts git submodule update remote checkout change strategy add strategies userdatastrategiesuserdatastrategies folder also dockercomposeymldockercomposeyml file strategylist add strategy list general recommendations optimal performance suggested use 4 6 open trades unlimited stake pairlist 40 80 pairs volume pairlist works well prefer stable coin usdt busd etc pairs instead btc eth pairs highly recommended blacklist leveraged tokens bull bear etc ensure ' override variables configjson especially timeframe must 5m usesellsig

In [34]:
# Label-based lookup: the row whose index label is 0, not the first row of train.
train.loc[0, 'lemmatized']

"nostalgiaforinfinity trading strategy freqtradehttpswwwfreqtradeio crypto bot clone repository plan clone repository use strategy regular git clone however plan running additional strategy run test suite need clone repository ' submodules newer version git bash git clone recursesubmodules httpsgithubcomiterativvnostalgiaforinfinitygit checkoutpath older version git bash git clone recursive httpsgithubcomiterativvnostalgiaforinfinitygit checkoutpath existing checkout git submodule update remote checkout change strategy add strategy userdatastrategiesuserdatastrategies folder also dockercomposeymldockercomposeyml file strategylist add strategy list general recommendation optimal performance suggested use 4 6 open trade unlimited stake pairlist 40 80 pair volume pairlist work well prefer stable coin usdt busd etc pair instead btc eth pair highly recommended blacklist leveraged token bull bear etc ensure ' override variable configjson especially timeframe must 5m usesellsignal must set tr

In [35]:
# Absolute and relative frequency of each language in the train split, using
# the helper defined in an earlier cell instead of repeating its body.
labels = show_counts_and_ratios(train, "language")
labels

Unnamed: 0,n,percent
HTML,16,0.275862
C#,15,0.258621
Python,14,0.241379
JavaScript,13,0.224138


In [37]:
train.columns

Index(['repo', 'language', 'readme_contents', 'clean', 'stemmed',
       'lemmatized'],
      dtype='object')

In [62]:
# Extra tokens to discard on top of NLTK's English stopword list.
ADDITIONAL_STOPWORDS = ['png']

def clean_up(text):
    """Normalize raw text and return its lemmatized, stopword-free tokens.

    The text is NFKD-normalized, stripped of every non-ASCII character,
    lowercased, stripped of every character that is neither a word character
    nor whitespace, and split on whitespace. Tokens found in NLTK's English
    stopword list or in the module-level ADDITIONAL_STOPWORDS are discarded;
    the remaining tokens are lemmatized.

    Parameters
    ----------
    text : str
        The raw text to process.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # A set gives constant-time membership checks; the token list can be very
    # long because callers pass whole corpora joined into one string.
    stopword_set = set(nltk.corpus.stopwords.words('english')).union(ADDITIONAL_STOPWORDS)
    ascii_text = (unicodedata.normalize('NFKD', text)
                  .encode('ascii', 'ignore')
                  .decode('utf-8', 'ignore')
                  .lower())
    # Punctuation is deleted rather than replaced by a space, so pieces joined
    # only by punctuation (e.g. the parts of a URL) fuse into a single token.
    tokens = re.sub(r'[^\w\s]', '', ascii_text).split()
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stopword_set]

In [63]:
def show_counts_and_ratios(df, column):
    """
    Summarize how often each value of one column occurs.

    Takes in a dataframe and a string naming a single column.
    Returns a dataframe indexed by the column's values with two columns:
    'n' (absolute count) and 'percent' (share of all rows, as a fraction).
    """
    # Count on the dataframe that was passed in; reading a global here would
    # silently ignore the `df` argument.
    counts = df[column].value_counts()
    shares = df[column].value_counts(normalize=True)
    labels = pd.concat([counts, shares], axis=1)
    labels.columns = ['n', 'percent']
    return labels

# This section explores the train split, so summarize `train` rather than the full `df`.
show_counts_and_ratios(train, "language")

Unnamed: 0,n,percent
HTML,16,0.275862
C#,15,0.258621
Python,14,0.241379
JavaScript,13,0.224138


In [84]:
# Join the `stemmed` text of every HTML row in train and tokenize it with clean_up.
# NOTE(review): clean_up lemmatizes its tokens, so this already-stemmed text is
# lemmatized again here — confirm that is intended.
html_words = clean_up(' '.join(train[train.language == 'HTML'].stemmed))

In [85]:
html_freq = pd.DataFrame(html_words)

In [87]:
html_freq.value_counts()

9           228
use          62
15001700     62
html         60
email        44
           ... 
go            1
got           1
gpl           1
gplv3         1
instanc       1
Length: 1486, dtype: int64

In [74]:
# Join the `stemmed` text of every Python row in train and tokenize it with clean_up.
# NOTE(review): clean_up lemmatizes its tokens, so this already-stemmed text is
# lemmatized again here — confirm that is intended.
python_words = clean_up(' '.join(train[train.language == 'Python'].stemmed))

In [75]:
p_words = pd.DataFrame(python_words)

In [76]:
p_words.value_counts()

use                                                                                                     209
video                                                                                                   202
youtubedl                                                                                               154
download                                                                                                154
file                                                                                                    139
                                                                                                       ... 
filterclass                                                                                               1
nearli                                                                                                    1
near                                                                                                      1
ncfoxnts3dhmrg2fupnr9mpv45nl

In [89]:
# Token lists per language, plus one for the whole train split, built from the
# `clean` column of train.
clean_text = train['clean']
language = train['language']

html_words = clean_up(' '.join(clean_text[language == 'HTML']))
python_words = clean_up(' '.join(clean_text[language == 'Python']))
C_sharp_words = clean_up(' '.join(clean_text[language == 'C#']))
JavaScript_words = clean_up(' '.join(clean_text[language == 'JavaScript']))

all_words = clean_up(' '.join(clean_text))

In [91]:
# Word frequencies per language and overall (most frequent first).
html_freq = pd.Series(html_words).value_counts()
python_freq = pd.Series(python_words).value_counts()
C_sharp_freq = pd.Series(C_sharp_words).value_counts()
JavaScript_freq = pd.Series(JavaScript_words).value_counts()
all_freq = pd.Series(all_words).value_counts()

# Show the five most frequent words of each group, separated by a divider line.
for freq in (html_freq, python_freq, C_sharp_freq, JavaScript_freq):
    print(freq.head())
    print("--------------")
print(all_freq.head())

9           228
15001700     62
html         60
email        44
use          39
dtype: int64
--------------
video        202
youtubedl    154
file         135
use          117
format       102
dtype: int64
--------------
9       262
use     130
c       128
new     124
type    113
dtype: int64
--------------
javascript    264
const         263
function      226
bad           203
1             191
dtype: int64
--------------
9             609
use           410
function      289
javascript    276
const         263
dtype: int64


In [92]:
#Josh's code
#words = []
#for word in list(train[train.language == 'Python'].lemmatized):
#    words.extend(word.split())

In [None]:
#python_words = pd.DataFrame(words)

In [93]:
#python_words.value_counts()