# NLP Group Project
### David's Personal Notebook

In [1]:
import pandas as pd
import numpy as np
import os
import json
import csv
import requests

import unicodedata
import string
import time
import random
from bs4 import BeautifulSoup
from requests import get
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

import env
import acquire
import prepare

## Acquire
Was not able to acquire dataset utilizing acquire.py due to computational limitations. \
Team member Josh acquired and uploaded the dataset as a csv for our team's convenience.

In [2]:
# Reading csv into notebook as df
df = pd.read_csv('~/codeup/data.csv')

In [3]:
df.head()

Unnamed: 0,repo,language,readme_contents
0,freeCodeCamp/freeCodeCamp,TypeScript,[![freeCodeCamp Social Banner](https://s3.amaz...
1,996icu/996.ICU,,[996.ICU](https://996.icu/#/en_US)\n=======\n*...
2,EbookFoundation/free-programming-books,,# List of Free Learning Resources In Many Lang...
3,jwasham/coding-interview-university,,# Coding Interview University\n\n> I originall...
4,kamranahmedse/developer-roadmap,TypeScript,"<p align=""center"">\n <img src=""public/brand.p..."


In [4]:
df.shape

(1031, 3)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031 entries, 0 to 1030
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             1031 non-null   object
 1   language         917 non-null    object
 2   readme_contents  1027 non-null   object
dtypes: object(3)
memory usage: 24.3+ KB


## Prepare

In [6]:
df.language.isnull().value_counts()

False    917
True     114
Name: language, dtype: int64

In [7]:
# Team concludes languages that are null be labeled "English"
df.language = df.language.fillna('English')

In [8]:
df.language.value_counts()

JavaScript          232
Python              114
English             114
TypeScript          110
Go                   78
Java                 70
C++                  56
C                    30
HTML                 23
Shell                22
Rust                 21
Swift                18
Ruby                 18
Jupyter Notebook     14
PHP                  14
C#                   13
Kotlin               12
CSS                  12
Vue                   8
Objective-C           5
Lua                   4
Dart                  4
Vim script            3
Haskell               3
Markdown              3
SCSS                  3
TeX                   3
Elixir                2
Vim Script            2
Clojure               2
Makefile              2
Zig                   1
Emacs Lisp            1
OCaml                 1
Julia                 1
CoffeeScript          1
Objective-C++         1
AsciiDoc              1
Nunjucks              1
Batchfile             1
Scala                 1
Jinja           

In [9]:
df.isnull().sum()

repo               0
language           0
readme_contents    4
dtype: int64

Number of nulls in readme_contents is negligable. Thus, I'll drop these rows.

In [10]:
# Dropping 4 null rows
df = df.dropna()

In [11]:
df.isnull().sum()

repo               0
language           0
readme_contents    0
dtype: int64

In order to avoid data imbalance, we limit languages to those who have at least 10 repo instances within our dataset.

In [12]:
# determining remaining list of languages
languages_kept = df.language.value_counts()[df.language.value_counts() > 10].index

In [13]:
languages_kept

Index(['JavaScript', 'Python', 'English', 'TypeScript', 'Go', 'Java', 'C++',
       'C', 'HTML', 'Shell', 'Rust', 'Swift', 'Ruby', 'Jupyter Notebook',
       'PHP', 'C#', 'Kotlin', 'CSS'],
      dtype='object')

In [14]:
# Removing languages not kept
df = df[df.language.isin(languages_kept)]

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 967 entries, 0 to 1030
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             967 non-null    object
 1   language         967 non-null    object
 2   readme_contents  967 non-null    object
dtypes: object(3)
memory usage: 30.2+ KB


In [16]:
df['readme_contents'] =  df['readme_contents'].astype('string')

In [17]:
df.head()

Unnamed: 0,repo,language,readme_contents
0,freeCodeCamp/freeCodeCamp,TypeScript,[![freeCodeCamp Social Banner](https://s3.amaz...
1,996icu/996.ICU,English,[996.ICU](https://996.icu/#/en_US) ======= **P...
2,EbookFoundation/free-programming-books,English,# List of Free Learning Resources In Many Lang...
3,jwasham/coding-interview-university,English,# Coding Interview University > I originally ...
4,kamranahmedse/developer-roadmap,TypeScript,"<p align=""center"">  <img src=""public/brand.pn..."


In [18]:
test_clean = df['readme_contents'].map(lambda x: prepare.basic_clean(x))

In [19]:
test_clean.isnull().sum()

0

Below, I utilize our prepare.py in order to clean, tokenize, and stem our `readme_contents` 

In [20]:
df['readme_contents'] = df['readme_contents'].map(lambda x: prepare.basic_clean(x))

In [21]:
df['readme_contents'] = df['readme_contents'].map(lambda x: prepare.tokenize(x))

In [22]:
df['stemmed'] = df['readme_contents'].map(lambda x: prepare.stem(x))

In [23]:
df['lemma'] = df['readme_contents'].map(lambda x: prepare.lemmatize(x))

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 967 entries, 0 to 1030
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             967 non-null    object
 1   language         967 non-null    object
 2   readme_contents  967 non-null    object
 3   stemmed          967 non-null    object
 4   lemma            967 non-null    object
dtypes: object(5)
memory usage: 45.3+ KB


In [25]:
df.head()

Unnamed: 0,repo,language,readme_contents,stemmed,lemma
0,freeCodeCamp/freeCodeCamp,TypeScript,freecodecamp social requests source freecodeca...,freecodecamp social request sourc freecodecamp...,freecodecamp social request source freecodecam...
1,996icu/996.ICU,English,note that there exists no other official accou...,note that there exist no other offici account ...,note that there exists no other official accou...
2,EbookFoundation/free-programming-books,English,list of free learning resources in many alignc...,list of free learn resourc in mani aligncent c...,list of free learning resource in many alignce...
3,jwasham/coding-interview-university,English,coding interview i originally created this as ...,code interview i origin creat thi as a short t...,coding interview i originally created this a a...
4,kamranahmedse/developer-roadmap,TypeScript,p img srcpublicbrandpng h2 p aligncentercommun...,p img srcpublicbrandpng h2 p aligncentercommun...,p img srcpublicbrandpng h2 p aligncentercommun...


In [26]:
df.language.value_counts()

JavaScript          232
Python              114
English             114
TypeScript          109
Go                   76
Java                 70
C++                  55
C                    30
HTML                 23
Shell                22
Rust                 21
Ruby                 18
Swift                18
PHP                  14
Jupyter Notebook     14
C#                   13
CSS                  12
Kotlin               12
Name: language, dtype: int64

In [27]:
df_explore = df

In [28]:
df_explore.shape

(967, 5)

In [29]:
df_explore['language'] =  df_explore['language'].astype('string')

In [30]:
df_explore.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 967 entries, 0 to 1030
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             967 non-null    object
 1   language         967 non-null    string
 2   readme_contents  967 non-null    object
 3   stemmed          967 non-null    object
 4   lemma            967 non-null    object
dtypes: object(4), string(1)
memory usage: 45.3+ KB


In [31]:
df_explore.language.value_counts()

JavaScript          232
Python              114
English             114
TypeScript          109
Go                   76
Java                 70
C++                  55
C                    30
HTML                 23
Shell                22
Rust                 21
Ruby                 18
Swift                18
PHP                  14
Jupyter Notebook     14
C#                   13
CSS                  12
Kotlin               12
Name: language, dtype: Int64

In [32]:
df_explore.lemma

0       freecodecamp social request source freecodecam...
1       note that there exists no other official accou...
2       list of free learning resource in many alignce...
3       coding interview i originally created this a a...
4       p img srcpublicbrandpng h2 p aligncentercommun...
                              ...                        
1026    a href src join the the tiny version of gfwlis...
1027    microsoft rest api microsoft rest api guidelin...
1028    the chat at version license build status cover...
1029    mpv external system release bug external is a ...
1030    div width400px height100px width400px height10...
Name: lemma, Length: 967, dtype: object

In [33]:
def get_word_freq(document, max=None, min=None):
    if type(document) != str:
        document = ' '.join(document)
    bag =  document.split(' ')
    series = pd.Series(bag)
    vc = series.value_counts()
    if max:
        vc = vc[vc <= max]
    if min:
        vc = vc[vc >= min]
    return vc


In [34]:
df_explore.language.value_counts()

JavaScript          232
Python              114
English             114
TypeScript          109
Go                   76
Java                 70
C++                  55
C                    30
HTML                 23
Shell                22
Rust                 21
Ruby                 18
Swift                18
PHP                  14
Jupyter Notebook     14
C#                   13
CSS                  12
Kotlin               12
Name: language, dtype: Int64

In [35]:
javascript_words = (' '.join(df_explore[df_explore.language == 'JavaScript']['lemma']))
python_words = (' '.join(df_explore[df_explore.language == 'Python']['lemma']))
english_words = (' '.join(df_explore[df_explore.language == 'English']['lemma']))
typescript_words = (' '.join(df_explore[df_explore.language == 'TypeScript']['lemma']))
go_words = (' '.join(df_explore[df_explore.language == 'Go']['lemma']))
java_words = (' '.join(df_explore[df_explore.language == 'Java']['lemma']))
c_plus_words = (' '.join(df_explore[df_explore.language == 'C++']['lemma']))
c_words = (' '.join(df_explore[df_explore.language == 'C']['lemma']))
html_words = (' '.join(df_explore[df_explore.language == 'HTML']['lemma']))
shell_words = (' '.join(df_explore[df_explore.language == 'Shell']['lemma']))
rust_words = (' '.join(df_explore[df_explore.language == 'Rust']['lemma']))
swift_words = (' '.join(df_explore[df_explore.language == 'Ruby']['lemma']))
ruby_words = (' '.join(df_explore[df_explore.language == 'Swift']['lemma']))
jupyter_words = (' '.join(df_explore[df_explore.language == 'PHP']['lemma']))
c_sharp_words = (' '.join(df_explore[df_explore.language == 'Jupyter Notebook']['lemma']))
css_words = (' '.join(df_explore[df_explore.language == 'C#']['lemma']))
kotlin_words = (' '.join(df_explore[df_explore.language == 'CSS']['lemma']))
php_words = (' '.join(df_explore[df_explore.language == 'Kotlin']['lemma']))

In [36]:
javascript_words = list(javascript_words.split(' '))
python_words = list(python_words.split(' '))
english_words = list(english_words.split(' '))
typescript_words = list(typescript_words.split(' '))
go_words = list(go_words.split(' '))
java_words = list(java_words.split(' '))
c_plus_words = list(c_plus_words.split(' '))
c_words = list(c_words.split(' '))
html_words = list(html_words.split(' '))
shell_words = list(shell_words.split(' '))
rust_words = list(rust_words.split(' '))
swift_words = list(swift_words.split(' '))
ruby_words = list(ruby_words.split(' '))
jupyter_words = list(jupyter_words.split(' '))
c_sharp_words = list(c_sharp_words.split(' '))
css_words = list(css_words.split(' '))
kotlin_words = list(kotlin_words.split(' '))
php_words = list(php_words.split(' '))

In [37]:
# python_words = list(python_words.split(' '))

In [38]:
javascript_freq = pd.Series(javascript_words).value_counts()
python_freq = pd.Series(python_words).value_counts()
typescript_freq = pd.Series(typescript_words).value_counts()
go_freq = pd.Series(go_words).value_counts()
java_freq = pd.Series(java_words).value_counts()
c_plus_freq = pd.Series(c_plus_words).value_counts()
c_freq = pd.Series(c_words).value_counts()
html_freq = pd.Series(html_words).value_counts()
shell_freq = pd.Series(shell_words).value_counts()
rust_freq = pd.Series(rust_words).value_counts()
swift_freq = pd.Series(swift_words).value_counts()
ruby_freq = pd.Series(ruby_words).value_counts()
jupyter_freq = pd.Series(jupyter_words).value_counts()
c_sharp_freq = pd.Series(c_sharp_words).value_counts()
css_freq = pd.Series(css_words).value_counts()
kotlin_freq = pd.Series(kotlin_words).value_counts()
php_freq = pd.Series(php_words).value_counts()


In [40]:
word_counts = (pd.concat([javascript_freq, python_freq, typescript_freq, go_freq, java_freq, c_plus_freq, c_freq, html_freq, shell_freq, rust_freq, swift_freq, ruby_freq, jupyter_freq, c_sharp_freq, css_freq, kotlin_freq, php_freq], axis=1, sort=True)
                .set_axis(['javascript', 'python', 'typescript', 'go', 'java', 'c_plus', 'c', 'html', 'shell', 'rust', 'swift', 'ruby', 'jupyter', 'c_sharp', 'css', 'kotlin', 'php'], axis=1, inplace=False)
                .fillna(0)
                .apply(lambda s: s.astype(int)))

word_counts.head()

Unnamed: 0,javascript,python,typescript,go,java,c_plus,c,html,shell,rust,swift,ruby,jupyter,c_sharp,css,kotlin,php
,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
&#9;,803,162,872,953,161,127,48,71,1709,0,668,73,21,197,18,11,0
0,304,196,31,136,18,18,6,27,18,16,0,1,14,3,1,5,1
00,6,37,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
000,7,2,0,0,2,0,0,5,0,0,0,0,0,0,0,0,0


In [41]:
word_counts

Unnamed: 0,javascript,python,typescript,go,java,c_plus,c,html,shell,rust,swift,ruby,jupyter,c_sharp,css,kotlin,php
,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
&#9;,803,162,872,953,161,127,48,71,1709,0,668,73,21,197,18,11,0
0,304,196,31,136,18,18,6,27,18,16,0,1,14,3,1,5,1
00,6,37,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
000,7,2,0,0,2,0,0,5,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zyte_,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
zythumbnailtableview,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
zzcompanies,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
zzindividuals,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
