In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk import sent_tokenize
from prepare import prepare

In [2]:
# JSON is UTF-8 by specification; pass the encoding explicitly so that reading
# the README text does not depend on the platform's default encoding.
with open('data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [29]:
df = pd.DataFrame(data)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             123 non-null    object
 1   language         121 non-null    object
 2   readme_contents  123 non-null    object
dtypes: object(3)
memory usage: 3.0+ KB


In [31]:
# Languages excluded from this analysis, removed in a single pass.
# Rows whose language is missing are not matched by isin() and are therefore
# kept by this filter, exactly as with the separate `!=` comparisons.
EXCLUDED_LANGUAGES = ["Shell", "Java", "Ruby"]
df = df[~df.language.isin(EXCLUDED_LANGUAGES)]

In [34]:
df.language = df.language.astype('string')

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 0 to 122
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             106 non-null    object
 1   language         104 non-null    string
 2   readme_contents  106 non-null    object
dtypes: object(2), string(1)
memory usage: 3.3+ KB


In [37]:
# Keep only the rows that actually have a language label.
df = df.loc[df['language'].notna()]

In [38]:
df.language.value_counts(dropna=False)

HTML          29
C#            27
Python        25
JavaScript    23
Name: language, dtype: Int64

In [24]:
# Use positional access: after the row filters above, the index label 0 is not
# guaranteed to exist, so `df.language[0]` could raise a KeyError.
type(df.language.iloc[0])

str

# See the `prepare` module for details on how the data is prepared and split

In [39]:
train, validate, test = prepare(df)

In [None]:
# NOTE(review): if these are DataFrames, `.size` is rows x columns (total cell
# count), not the number of rows -- confirm this is the figure wanted here.
train.size, validate.size, test.size

In [None]:
train.head()

# No duplicates

In [None]:
df[df.duplicated()]

# Exploration

In [None]:
def clean(text, extra_stopwords=()):
    """
    Normalize raw text and return its lemmatized, stopword-free words.

    The text is reduced to lowercase ASCII (accented characters are decomposed
    with NFKD and whatever cannot be encoded as ASCII is dropped), every
    character that is neither a word character nor whitespace is removed, and
    the result is split on whitespace.

    Parameters
    ----------
    text : str
        The text to clean.
    extra_stopwords : iterable of str, optional
        Words to discard in addition to NLTK's English stopword list.
        Defaults to no extra words.

    Returns
    -------
    list of str
        The remaining words, in their original order, each passed through
        NLTK's WordNet lemmatizer. Stopwords are matched before lemmatization.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # A set gives constant-time membership tests; a list would be rescanned
    # once for every word in the text.
    stopword_set = set(nltk.corpus.stopwords.words('english')).union(extra_stopwords)
    ascii_text = (unicodedata.normalize('NFKD', text)
                  .encode('ascii', 'ignore')
                  .decode('utf-8', 'ignore')
                  .lower())
    words = re.sub(r'[^\w\s]', '', ascii_text).split()
    return [lemmatizer.lemmatize(word) for word in words if word not in stopword_set]

In [None]:
def show_counts_and_ratios(df, column):
    """
    Summarize how often each distinct value of a column occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarize.
    column : str
        Name of a single column of `df`.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of `column`, with two columns:
        `n`, the absolute count, and `percent`, the share of rows as a
        fraction between 0 and 1 (it is not multiplied by 100).
        Missing values are left out, as is the default for `value_counts`.
    """
    counts = df[column].value_counts()
    shares = df[column].value_counts(normalize=True)
    # `keys` names the resulting columns directly.
    return pd.concat([counts, shares], axis=1, keys=['n', 'percent'])

show_counts_and_ratios(df, "language")

In [None]:
python_words = clean(' '.join(df[df.language == 'Python'].readme_contents))

In [None]:
c_sharp_words = clean(' '.join(df[df.language == 'C#'].readme_contents))

In [None]:
html_words = clean(' '.join(df[df.language == 'HTML'].readme_contents))

In [None]:
javascript_words = clean(' '.join(df[df.language == 'JavaScript'].readme_contents))

In [None]:
# NOTE(review): when the notebook is run top to bottom, rows with language
# 'Java' have already been filtered out of df, so this selection is empty and
# clean('') returns an empty list.
java_words = clean(' '.join(df[df.language == 'Java'].readme_contents))

In [None]:
# NOTE(review): 'Ruby' rows were likewise filtered out of df earlier, so this
# list is empty on a top-to-bottom run.
ruby_words = clean(' '.join(df[df.language == 'Ruby'].readme_contents))

In [None]:
# NOTE(review): 'Shell' rows were likewise filtered out of df earlier, so this
# list is empty on a top-to-bottom run.
shell_words = clean(' '.join(df[df.language == 'Shell'].readme_contents))

### Word frequencies

In [None]:
python_freq = pd.Series(python_words).value_counts()

In [None]:
c_sharp_freq = pd.Series(c_sharp_words).value_counts()

In [None]:
html_freq = pd.Series(html_words).value_counts()

In [None]:
javascript_freq = pd.Series(javascript_words).value_counts()

In [None]:
java_freq = pd.Series(java_words).value_counts()

In [None]:
ruby_freq = pd.Series(ruby_words).value_counts()

In [None]:
shell_freq = pd.Series(shell_words).value_counts()

In [None]:
python_freq.head()

In [None]:
c_sharp_freq.head()

In [None]:
html_freq.head()

In [None]:
javascript_freq.head()

In [None]:
java_freq.head()

### Ruby and Shell each have only one observation, so we can either drop them or go find more data

In [None]:
ruby_freq.head()

In [None]:
shell_freq.head()