In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk import sent_tokenize
from prepare import prepare

In [2]:
# The README text contains non-ASCII characters, so decode the file as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open('data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [3]:
# Build a DataFrame from the parsed JSON.
# NOTE(review): presumably `data` is a list of per-repo records (repo, language, readme_contents) — confirm against data.json.
df = pd.DataFrame(data)

# See the `prepare` module for details of the preparation steps

In [4]:
# NOTE(review): rebinding `df` makes this cell non-idempotent — re-running it feeds
# already-prepared data back into prepare(). The clean/stemmed/lemmatized columns
# shown by df.head() are presumably added here; confirm in the prepare module.
df = prepare(df)

In [5]:
# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
0,iterativv/NostalgiaForInfinity,Python,# NostalgiaForInfinity\nTrading strategy for t...,nostalgiaforinfinity trading strategy freqtrad...,nostalgiaforinfin trade strategi freqtradehttp...,nostalgiaforinfinity trading strategy freqtrad...
1,cl2333/Grokking-the-Coding-Interview-Patterns-...,Python,# [Grokking-the-Coding-Interview-Patterns-for-...,grokkingthecodinginterviewpatternsforcodingque...,grokkingthecodinginterviewpatternsforcodingque...,grokkingthecodinginterviewpatternsforcodingque...
2,edeng23/binance-trade-bot,Python,# binance-trade-bot\n\n![github](https://img.s...,binancetradebot githubhttpsimgshieldsiogithubw...,binancetradebot githubhttpsimgshieldsiogithubw...,binancetradebot githubhttpsimgshieldsiogithubw...
3,donnemartin/system-design-primer,Python,*[English](README.md) ∙ [日本語](README-ja.md) ∙ ...,englishreadmemd readmejamd readmezhhansmd read...,englishreadmemd readmejamd readmezhhansmd read...,englishreadmemd readmejamd readmezhhansmd read...
4,dortania/OpenCore-Legacy-Patcher,Python,"<div align=""center"">\n <img src=""i...",div aligncenter img srcimagesocpatcherpng alto...,div aligncent img srcimagesocpatcherpng altope...,div aligncenter img srcimagesocpatcherpng alto...


# Check for duplicate rows (none found)

In [6]:
# Show rows that exactly repeat an earlier row; an empty result means there are none.
df[df.duplicated()]

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized


# Exploration

In [7]:
def clean(text, extra_stopwords=None):
    """
    Normalize raw text and return its lemmatized, non-stopword tokens.

    The text is NFKD-normalized, stripped of non-ASCII characters, lowercased,
    and stripped of every character that is neither a word character nor
    whitespace. It is then split on whitespace, English stopwords are removed,
    and each remaining word is lemmatized.

    Parameters
    ----------
    text : str
        The raw text to clean.
    extra_stopwords : iterable of str, optional
        Additional (lowercase) words to drop on top of NLTK's English
        stopword list. Defaults to none.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # A set gives constant-time membership checks for every token, and the
    # name avoids shadowing the `stopwords` corpus imported at the top.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    if extra_stopwords is not None:
        stopword_set.update(extra_stopwords)

    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split()

    # Stopwords are filtered before lemmatizing, so only kept words are lemmatized.
    return [lemmatizer.lemmatize(word) for word in words if word not in stopword_set]

In [8]:
def show_counts_and_ratios(df, column):
    """
    Summarize how often each value of `column` occurs in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarize.
    column : str
        Name of a single column in `df`.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of `column`, most frequent first, with
        two columns: 'n' (absolute count) and 'percent' (share of the counted
        rows as a fraction between 0 and 1, not multiplied by 100).
    """
    counts = df[column].value_counts()
    ratios = df[column].value_counts(normalize=True)
    labels = pd.concat([counts, ratios], axis=1)
    labels.columns = ['n', 'percent']
    return labels

# Distribution of repos across languages (counts and fractions).
show_counts_and_ratios(df, "language")

Unnamed: 0,n,percent
HTML,29,0.239669
C#,27,0.22314
Python,25,0.206612
JavaScript,23,0.190083
Java,15,0.123967
Shell,1,0.008264
Ruby,1,0.008264


In [9]:
def readme_words(language):
    """
    Return the cleaned word list for all READMEs of repos labeled `language`.

    Joins the `readme_contents` of every row in the notebook-level `df` whose
    `language` equals the argument into one space-separated string and passes
    it through clean().
    """
    readmes = df.loc[df.language == language, 'readme_contents']
    return clean(' '.join(readmes))


# One word list per language; the names are reused by the frequency cells below.
python_words = readme_words('Python')
c_sharp_words = readme_words('C#')
html_words = readme_words('HTML')
javascript_words = readme_words('JavaScript')
java_words = readme_words('Java')
ruby_words = readme_words('Ruby')
shell_words = readme_words('Shell')

### Word frequencies per language

In [16]:
def word_counts(words):
    """
    Count how often each distinct word occurs in `words`.

    Returns a pandas Series indexed by word and sorted by count, most
    frequent first.
    """
    return pd.Series(words).value_counts()


# One frequency table per language, built from the matching *_words list.
python_freq = word_counts(python_words)
c_sharp_freq = word_counts(c_sharp_words)
html_freq = word_counts(html_words)
javascript_freq = word_counts(javascript_words)
java_freq = word_counts(java_words)
ruby_freq = word_counts(ruby_words)
shell_freq = word_counts(shell_words)

In [23]:
# Five most frequent tokens across the Python READMEs.
# NOTE(review): "yes"/"unknown"/"apikey" look like table cells from a single large README — confirm one repo is not dominating the counts.
python_freq.head()

yes        1615
unknown     909
apikey      558
video       489
python      447
dtype: int64

In [24]:
# Five most frequent tokens across the C# READMEs.
# NOTE(review): the bare "c" token is presumably "C#" after clean() strips punctuation — confirm.
c_sharp_freq.head()

new       192
c         188
use       186
var       185
csharp    156
dtype: int64

In [25]:
# Five most frequent tokens across the HTML READMEs.
# NOTE(review): "15001700" is presumably a time range fused by punctuation stripping in clean() — confirm; consider dropping digit-only tokens.
html_freq.head()

html        97
use         76
file        76
15001700    62
dom         51
dtype: int64

In [26]:
# Five most frequent tokens across the JavaScript READMEs.
# NOTE(review): digit-only tokens such as "1" survive clean(); consider filtering them out.
javascript_freq.head()

javascript    309
function      276
const         261
1             240
bad           205
dtype: int64

In [27]:
# Five most frequent tokens across the Java READMEs.
# NOTE(review): "td", "width100px", "aligncentera" and "altbr" look like HTML-markup residue with identical counts — presumably one README's table; consider stripping HTML before tokenizing.
java_freq.head()

aligncentera    189
td              189
width100px      189
altbr           189
java            177
dtype: int64

### Ruby and Shell each have only one observation, so we should either drop them or collect more repos

In [28]:
# Five most frequent tokens for Ruby — the sample holds a single Ruby repo, so the counts are small.
ruby_freq.head()

cookbook      6
chef          5
amazon        2
project       2
maintainer    2
dtype: int64

In [29]:
# Five most frequent tokens for Shell — the sample holds a single Shell repo, so the counts are small.
shell_freq.head()

java              13
docker            10
run                7
dockerfilejava     5
jdk                5
dtype: int64