In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk import sent_tokenize
from prepare import prepare

In [2]:
# Load the repo records from disk. The encoding is given explicitly because the
# README text contains non-ASCII characters and open() would otherwise fall back
# to the platform's default encoding, which is not UTF-8 everywhere.
with open('data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [111]:
df = pd.DataFrame(data)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             123 non-null    object
 1   language         121 non-null    object
 2   readme_contents  123 non-null    object
dtypes: object(3)
memory usage: 3.0+ KB


In [5]:
# Keep every row whose language is not "Shell" (rows with a missing language are kept).
df = df.loc[df['language'].ne("Shell")]

In [6]:
# Keep every row whose language is not "Java" (rows with a missing language are kept).
df = df.loc[df['language'].ne("Java")]

In [7]:
# Keep every row whose language is not "Ruby" (rows with a missing language are kept).
df = df.loc[df['language'].ne("Ruby")]

In [8]:
# Cast the label column to pandas' nullable string dtype. assign() returns a new
# frame instead of writing into `df`, which at this point is the result of
# boolean filtering, so no SettingWithCopyWarning can be triggered.
df = df.assign(language=df['language'].astype('string'))

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 0 to 122
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             106 non-null    object
 1   language         104 non-null    string
 2   readme_contents  106 non-null    object
dtypes: object(2), string(1)
memory usage: 3.3+ KB


In [10]:
# Discard the rows that have no language label.
df = df.dropna(subset=['language'])

In [11]:
df.language.value_counts(dropna=False)

HTML          29
C#            27
Python        25
JavaScript    23
Name: language, dtype: Int64

In [12]:
# Positional lookup of the first remaining row. `df.language[0]` is label-based
# and would raise KeyError as soon as the row labelled 0 is filtered out.
type(df['language'].iloc[0])

str

# See the `prepare` module for details of the preparation steps

In [13]:
train, validate, test = prepare(df)

In [14]:
# NOTE: DataFrame.size is rows x columns (total cells), not the number of rows.
# train has 6 columns (see train.head() below), so 348 cells correspond to 58 rows.
# Use len(...) or .shape when row counts are wanted.
train.size, validate.size, test.size

(348, 150, 126)

In [15]:
train.head()

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
76,Ryujinx/Ryujinx,C#,"\n<h1>\n <img src=""https://i.imgur.com/G6Ml...",h1 img srchttpsiimgurcomg6mlecopng ryujinx hre...,h1 img srchttpsiimgurcomg6mlecopng ryujinx hre...,h1 img srchttpsiimgurcomg6mlecopng ryujinx hre...
73,egametang/ET,C#,# [中文](https://github.com/egametang/Egametang/...,httpsgithubcomegametangegametangblobmasterread...,httpsgithubcomegametangegametangblobmasterread...,httpsgithubcomegametangegametangblobmasterread...
91,bombomby/optick,C#,# [Optick: C++ Profiler For Games](https://opt...,optick c profiler gameshttpsoptickdev githubht...,optick c profil gameshttpsoptickdev githubhttp...,optick c profiler gameshttpsoptickdev githubht...
1,cl2333/Grokking-the-Coding-Interview-Patterns-...,Python,# [Grokking-the-Coding-Interview-Patterns-for-...,grokkingthecodinginterviewpatternsforcodingque...,grokkingthecodinginterviewpatternsforcodingque...,grokkingthecodinginterviewpatternsforcodingque...
102,gustavoguanabara/html-css,HTML,"<img src=""imagens/mascote.png"" align=""right"" w...",img srcimagensmascotepng alignright width300 c...,img srcimagensmascotepng alignright width300 c...,img srcimagensmascotepng alignright width300 c...


# No duplicates

In [16]:
df[df.duplicated()]

Unnamed: 0,repo,language,readme_contents


# Exploration

In [101]:
def clean(text, extra_stopwords=None):
    """
    Normalise raw text into a list of lemmatized, lower-case word tokens.

    Steps: decompose accented characters and drop everything non-ASCII,
    lower-case, remove every character that is neither a word character nor
    whitespace, split on whitespace, drop stopwords, lemmatize the rest.

    Parameters
    ----------
    text : str
        Raw text to clean.
    extra_stopwords : iterable of str, optional
        Words to drop in addition to NLTK's English stopword list.
        Defaults to None (no extra words), which matches the previous behaviour.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order; duplicates are kept.
    """
    wnl = nltk.stem.WordNetLemmatizer()

    # A set gives constant-time membership tests; a list would be scanned
    # once for every word of the input.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    if extra_stopwords is not None:
        stopword_set.update(extra_stopwords)

    # NFKD splits accented characters into base letter + combining mark; the
    # ASCII round-trip then discards the marks and any other non-ASCII character.
    ascii_text = (unicodedata.normalize('NFKD', text)
                  .encode('ascii', 'ignore')
                  .decode('utf-8', 'ignore')
                  .lower())
    words = re.sub(r'[^\w\s]', '', ascii_text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopword_set]

In [18]:
def show_counts_and_ratios(df, column):
    """
    Tabulate how often each value of a single column occurs.

    df: dataframe to inspect
    column: name (string) of the column to tabulate

    Returns a dataframe indexed by the column's values with the absolute
    count in 'n' and the relative frequency (a 0-1 fraction) in 'percent'.
    """
    values = df[column]
    summary = pd.concat(
        [values.value_counts(), values.value_counts(normalize=True)],
        axis=1,
    )
    summary.columns = ['n', 'percent']
    return summary

show_counts_and_ratios(df, "language")

Unnamed: 0,n,percent
HTML,29,0.278846
C#,27,0.259615
Python,25,0.240385
JavaScript,23,0.221154


In [102]:
python_words = clean(' '.join(train[train.language == 'Python'].readme_contents))

In [103]:
p_words = pd.DataFrame(python_words)

In [104]:
p_words.value_counts()

video                     200
youtubedl                 150
file                      135
use                       117
format                    102
                         ... 
botwritingyourfirstbot      1
liked                       1
bottelegramshodan           1
likewise                    1
july                        1
Length: 4263, dtype: int64

In [108]:
# Flatten the lemmatized Python READMEs of the training split into one token list.
words = [
    token
    for document in train[train.language == 'Python'].lemmatized
    for token in document.split()
]

In [109]:
python_words = pd.DataFrame(words)

In [110]:
python_words.value_counts()

'            524
video        202
youtubedl    154
file         135
use          117
            ... 
btt            1
likewise       1
limited        1
limitrate      1
joining        1
Length: 4216, dtype: int64

In [20]:
c_sharp_words = clean(' '.join(df[df.language == 'C#'].readme_contents))

In [21]:
html_words = clean(' '.join(df[df.language == 'HTML'].readme_contents))

In [22]:
javascript_words = clean(' '.join(df[df.language == 'JavaScript'].readme_contents))

In [23]:
# NOTE: rows with language 'Java' were already filtered out of df in In [6],
# so this selection is empty and java_words is an empty list.
java_words = clean(' '.join(df[df.language == 'Java'].readme_contents))

In [24]:
# NOTE: rows with language 'Ruby' were already filtered out of df in In [7],
# so this selection is empty and ruby_words is an empty list.
ruby_words = clean(' '.join(df[df.language == 'Ruby'].readme_contents))

In [25]:
# NOTE: rows with language 'Shell' were already filtered out of df in In [5],
# so this selection is empty and shell_words is an empty list.
shell_words = clean(' '.join(df[df.language == 'Shell'].readme_contents))

### Word frequencies

In [26]:
# Build the Python word list from the full df, exactly like the C#, HTML and
# JavaScript cells above, instead of reusing `python_words`: that name is first
# filled from the train split only and later rebound to a DataFrame (In [109]),
# so in a top-to-bottom run it is neither a flat word list nor comparable with
# the other languages' counts.
python_freq = pd.Series(
    clean(' '.join(df[df.language == 'Python'].readme_contents))
).value_counts()

In [27]:
c_sharp_freq = pd.Series(c_sharp_words).value_counts()

In [28]:
html_freq = pd.Series(html_words).value_counts()

In [29]:
javascript_freq = pd.Series(javascript_words).value_counts()

In [30]:
# java_words is empty here; the explicit dtype avoids pandas' warning about the
# default dtype of an empty Series and changes nothing for a non-empty word list.
java_freq = pd.Series(java_words, dtype='object').value_counts()

  java_freq = pd.Series(java_words).value_counts()


In [31]:
# ruby_words is empty here; the explicit dtype avoids pandas' warning about the
# default dtype of an empty Series and changes nothing for a non-empty word list.
ruby_freq = pd.Series(ruby_words, dtype='object').value_counts()

  ruby_freq = pd.Series(ruby_words).value_counts()


In [32]:
# shell_words is empty here; the explicit dtype avoids pandas' warning about the
# default dtype of an empty Series and changes nothing for a non-empty word list.
shell_freq = pd.Series(shell_words, dtype='object').value_counts()

  shell_freq = pd.Series(shell_words).value_counts()


In [33]:
python_freq.head()

yes        1615
unknown     909
apikey      558
video       489
python      447
dtype: int64

In [34]:
c_sharp_freq.head()

new       192
c         188
use       186
var       185
csharp    156
dtype: int64

In [35]:
html_freq.head()

html        97
use         76
file        76
15001700    62
dom         51
dtype: int64

In [36]:
javascript_freq.head()

javascript    309
function      276
const         261
1             240
bad           205
dtype: int64

In [37]:
java_freq.head()

Series([], dtype: int64)

### Ruby and Shell each have only one observation, so we can either drop them or go find more data

In [38]:
ruby_freq.head()

Series([], dtype: int64)

In [39]:
shell_freq.head()

Series([], dtype: int64)