## Project Imports

In [1]:
import pandas as pd

#acquire and prep
from env import github_token, github_username
import acquire
import prepare

import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

import re
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

#visualize
from wordcloud import WordCloud
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

#ignore warnings (turn off pink warning boxes)
import warnings
warnings.filterwarnings("ignore")

#tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

#train, validate, test
from sklearn.model_selection import train_test_split

#creating / evaluating models
# Decision Tree  
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# K-Nearest Neighbor(KNN)  
from sklearn.neighbors import KNeighborsClassifier

# Logistic Regression
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, accuracy_score

# Data Acquisition

In [2]:
"""
A module for obtaining repo readme and language data from the github API.

Before using this module, read through it, and follow the instructions marked
TODO.

After doing so, run it like this:

    python acquire.py

To create the `data.json` file that contains the data.
"""
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

from env import github_token, github_username

# TODO: Make a github personal access token.
#     1. Go here and generate a personal access token https://github.com/settings/tokens
#        You do _not_ need to select any scopes, i.e. leave all the checkboxes unchecked
#     2. Save it in your env.py file under the variable `github_token`
# TODO: Add your github username to your env.py file under the variable `github_username`
# TODO: Add more repositories to the `REPOS` list below.

# Repositories to scrape, each written as "owner/name". scrape_github_data()
# passes every entry to process_repo(), which interpolates it into
# https://api.github.com/repos/{repo} style urls.
REPOS = [
    "freeCodeCamp/freeCodeCamp",
    "vuejs/vue",
    "facebook/react",
    "tensorflow/tensorflow",
    "twbs/bootstrap",
    "donnemartin/system-design-primer",
    "ohmyzsh/ohmyzsh",
    "public-apis/public-apis",
    "microsoft/vscode",
    "torvalds/linux",
    "airbnb/javascript",
    "trekhleb/javascript-algorithms",
    "TheAlgorithms/Python",
    "d3/d3",
    "facebook/react-native",
    "ytdl-org/youtube-dl",
    "electron/electron",
    "axios/axios",
    "facebook/create-react-app",
    "nodejs/node",
    "kubernetes/kubernetes",
    "30-seconds/30-seconds-of-code",
    "microsoft/terminal",
    "tensorflow/models",
    "vercel/next.js",
    "iluwatar/java-design-patterns",
    "FortAwesome/Font-Awesome",
    "goldbergyoni/nodebestpractices",
    "laravel/laravel",
    "nvbn/thefuck",
    "atom/atom",
    "spring-projects/spring-boot",
    "elastic/elasticsearch",
    "jquery/jquery",
    "microsoft/PowerToys",
    "opencv/opencv",
    "typicode/json-server",
    "netdata/netdata",
    "keras-team/keras",
    "chrislgarry/Apollo-11",
    "httpie/httpie",
    "josephmisiti/awesome-machine-learning",
    "h5bp/html5-boilerplate",
    "lodash/lodash",
    "Semantic-Org/Semantic-UI",
    "h5bp/Front-end-Developer-Interview-Questions",
    "redis/redis",
    "yangshun/tech-interview-handbook",
    "chartjs/Chart.js",
    "socketio/socket.io",
    "bitcoin/bitcoin",
    "ionic-team/ionic-framework",
    "necolas/normalize.css",
    "ReactTraining/react-router",
    "huggingface/transformers",
    "scikit-learn/scikit-learn",
    "moment/moment",
    "psf/requests",
    "ReactiveX/RxJava",
    "impress/impress.js",
    "mermaid-js/mermaid",
    "Alamofire/Alamofire",
    "serverless/serverless",
    "prettier/prettier",
    "juliangarnier/anime",
    "godotengine/godot",
    "ColorlibHQ/AdminLTE",
    "apache/superset",
    "parcel-bundler/parcel",
    "square/retrofit",
    "spring-projects/spring-framework",
    "jekyll/jekyll",
    "home-assistant/core",
    "meteor/meteor",
    "jaywcjlove/awesome-mac",
    "grafana/grafana",
    "NARKOZ/hacker-scripts",
    "tailwindlabs/tailwindcss",
    "syncthing/syncthing",
    "strapi/strapi",
    "apache/dubbo",
    "deepfakes/faceswap",
    "iamkun/dayjs",
    "mozilla/pdf.js",
    "python/cpython",
    "vsouza/awesome-ios",
    "TryGhost/Ghost",
    "hexojs/hexo",
    "gulpjs/gulp",
    "alvarotrigo/fullPage.js",
    "Marak/faker.js",
    "fastlane/fastlane",
    "NationalSecurityAgency/ghidra",
    "beego/beego",
    "jashkenas/underscore",
    "skylot/jadx",
    "agalwood/Motrix",
    "pingcap/tidb",
    "bayandin/awesome-awesomeness",
    "microsoft/playwright",
    "go-gorm/gorm",
    "iview/iview",
    "cheeriojs/cheerio",
    "mobxjs/mobx",
    "GitbookIO/gitbook",
    "anuraghazra/github-readme-stats",
    "ryanoasis/nerd-fonts",
    "google-research/bert",
    "bumptech/glide",
    "airbnb/lottie-android",
    "immutable-js/immutable-js",
    "tiangolo/fastapi",
    "jondot/awesome-react-native",
    "Blankj/AndroidUtilCode",
    "FFmpeg/FFmpeg",
    "ctripcorp/apollo",
    "typescript-cheatsheets/react",
    "sherlock-project/sherlock",
    "gorhill/uBlock",
    "PowerShell/PowerShell"

]

# Headers sent with every github API request: the personal access token and
# the username (as the user agent), both taken from env.py.
headers = {
    "Authorization": f"token {github_token}",
    "User-Agent": github_username,
}

# Fail fast when either env.py value was left as an empty string.
credentials_missing = "token " == headers["Authorization"] or "" == headers["User-Agent"]
if credentials_missing:
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str) -> Union[List, Dict]:
    """
    Send a GET request to the github API and return the decoded JSON body.

    The module-level `headers` (token and user agent) are sent with the
    request.

    Raises an Exception when the status code is not 200. The message contains
    the status code and the response body.
    """
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # An error response is not guaranteed to be JSON, so fall back to the
        # raw text instead of letting a decode error hide the status code.
        try:
            error_body = json.dumps(response.json())
        except ValueError:
            error_body = response.text
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {error_body}"
        )
    return response.json()


def get_repo_language(repo: str) -> str:
    """
    Return the value of the "language" key from the github API's description
    of `repo` (an "owner/name" string).

    Raises an Exception when the API response is not a dictionary or when it
    has no "language" key.

    NOTE(review): the value is returned as-is, so a JSON null would come back
    as None despite the `str` annotation -- confirm whether callers need to
    handle that.
    """
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if not isinstance(repo_info, dict):
        raise Exception(
            f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
        )
    if "language" not in repo_info:
        raise Exception(
            "'language' key not found in response\n{}".format(json.dumps(repo_info))
        )
    return repo_info["language"]


def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    """
    Fetch the github API listing for the top-level contents of `repo`
    (an "owner/name" string).

    Raises an Exception when the API response is not a list.
    """
    url = f"https://api.github.com/repos/{repo}/contents/"
    listing = github_api_request(url)
    if type(listing) is not list:
        raise Exception(
            f"Expecting a list response from {url}, instead got {json.dumps(listing)}"
        )
    return cast(List, listing)


def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.

    An entry matches when its name starts with "readme" (case-insensitive).
    Matching entries with a missing or empty download url are skipped so the
    search continues with later entries. Returns "" when no entry yields a
    usable url.
    """
    for file in files:
        if not file["name"].lower().startswith("readme"):
            continue
        download_url = file.get("download_url")
        # A null url cannot be fetched, so keep looking instead of returning it.
        # NOTE(review): presumably directories are the entries that come back
        # with a null download_url -- confirm against the github API docs.
        if download_url:
            return download_url
    return ""


def process_repo(repo: str) -> Dict[str, str]:
    """
    Build the record for a single repo (given as "owner/name", e.g.
    "gocodeup/codeup-setup-script"): its name, its language and the text of
    its README. The README text is "" when no README url was found.
    """
    readme_url = get_readme_download_url(get_repo_contents(repo))
    # An empty url means there is nothing to download.
    readme_text = requests.get(readme_url).text if readme_url != "" else ""

    record = {"repo": repo}
    record["language"] = get_repo_language(repo)
    record["readme_contents"] = readme_text
    return record


def scrape_github_data() -> List[Dict[str, str]]:
    """
    Run process_repo on each entry of REPOS, in order, and return the
    resulting records as a list.
    """
    scraped: List[Dict[str, str]] = []
    for repo_name in REPOS:
        scraped.append(process_repo(repo_name))
    return scraped


if __name__ == "__main__":
    # Scrape every repo in REPOS and save the records to data.json.
    data = scrape_github_data()
    # The context manager closes the file, so the JSON is fully written to
    # disk before anything later reads it.
    with open("data.json", "w") as data_file:
        json.dump(data, data_file, indent=1)


In [3]:
#Load the scraped repo data from data.json (the file the acquire code writes)
df = pd.read_json('data.json')
df.head()

Unnamed: 0,repo,language,readme_contents
0,freeCodeCamp/freeCodeCamp,JavaScript,![freeCodeCamp.org Social Banner](https://s3.a...
1,vuejs/vue,JavaScript,"<p align=""center""><a href=""https://vuejs.org"" ..."
2,facebook/react,JavaScript,# [React](https://reactjs.org/) &middot; [![Gi...
3,tensorflow/tensorflow,C++,"<div align=""center"">\n <img src=""https://www...."
4,twbs/bootstrap,JavaScript,"<p align=""center"">\n <a href=""https://getboot..."


In [4]:
#Number of rows and columns in the acquired data, as (rows, columns)
df.shape

(120, 3)

In [5]:
#Column names, non-null counts and dtypes of the acquired data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             120 non-null    object
 1   language         120 non-null    object
 2   readme_contents  120 non-null    object
dtypes: object(3)
memory usage: 2.9+ KB


In [6]:
#Number of repos for each language value
df.language.value_counts()

JavaScript    53
Python        19
Java          13
C++            6
TypeScript     6
Go             5
C              4
Ruby           3
Swift          2
CSS            2
C#             2
Assembly       1
Nunjucks       1
PHP            1
Vue            1
Shell          1
Name: language, dtype: int64

In [7]:
#Count the null values in each column
df.isnull().sum()

repo               0
language           0
readme_contents    0
dtype: int64

# Data Preparation

In [8]:
#Clean the README text with prepare.clean_content, then discard the raw
#readme_contents column as part of the same expression
df = (
    prepare.clean_content(
        df,
        'readme_contents',
        extra_words=['p', 'aligncenter', 'img'],
        exclude_words=['no'],
    )
    .drop(columns=['readme_contents'])
)

df.head()

Unnamed: 0,repo,clean_content,language
0,freeCodeCamp/freeCodeCamp,freecodecamporg social bannerhttpss3amazonawsc...,JavaScript
1,vuejs/vue,aligncentera hrefhttpsvuejsorg targetblank rel...,JavaScript
2,facebook/react,reacthttpsreactjsorg middot github licensehttp...,JavaScript
3,tensorflow/tensorflow,div srchttpswwwtensorfloworgimagestflogosocial...,C++
4,twbs/bootstrap,hrefhttpsgetbootstrapcom srchttpsgetbootstrapc...,JavaScript


In [9]:
#Number of distinct languages left after the prepare step; languages with
#fewer than 5 occurrences are expected to have been dropped
len(df.language.value_counts())

6

In [10]:
#Number of repos per language, next to each language's share of all repos
#(share rounded to 2 decimal places)
language_counts = df.language.value_counts()
language_shares = df.language.value_counts(normalize=True).round(2)

languages = pd.concat([language_counts, language_shares], axis=1)
languages.columns = ['n', 'percent']

languages

Unnamed: 0,n,percent
JavaScript,53,0.52
Python,19,0.19
Java,13,0.13
C++,6,0.06
TypeScript,6,0.06
Go,5,0.05


In [11]:
#Rows and columns remaining in the prepped DF, as (rows, columns)
df.shape

(102, 3)

# Data Exploration

In [12]:
#Gather the cleaned README text of each language into one long string

def _language_text(language):
    """Join the clean_content of every repo labeled `language` with single spaces."""
    return ' '.join(df.loc[df.language == language, 'clean_content'])

python_words = _language_text('Python')
javascript_words = _language_text('JavaScript')
java_words = _language_text('Java')
#c_words holds the text of the C++ repos
c_words = _language_text('C++')
typescript_words = _language_text('TypeScript')
go_words = _language_text('Go')

#text of every repo, regardless of language
all_lang_words = ' '.join(df.clean_content)

In [13]:
#Count how often each word occurs, per language and overall

def _word_frequencies(text):
    """Split `text` on whitespace and return the count of each distinct word."""
    return pd.Series(text.split()).value_counts()

python_freq = _word_frequencies(python_words)
javascript_freq = _word_frequencies(javascript_words)
java_freq = _word_frequencies(java_words)
c_freq = _word_frequencies(c_words)
typescript_freq = _word_frequencies(typescript_words)
go_freq = _word_frequencies(go_words)
all_lang_freq = _word_frequencies(all_lang_words)

#(label, counts) pairs in print order; the 'C:' section is built from c_words,
#i.e. the C++ repos
freq_report = [
    ('Python:', python_freq),
    ('JavaScript:', javascript_freq),
    ('Java:', java_freq),
    ('C:', c_freq),
    ('TypeScript:', typescript_freq),
    ('Go:', go_freq),
    ('All Languages:', all_lang_freq),
]

for position, (label, freq) in enumerate(freq_report):
    #separator goes between sections only, not before the first one
    if position:
        print('--------------------------')
    print(label)
    print(freq)

Python:
yes                                                               915
unknown                                                           594
no                                                                579
data                                                              464
learning                                                          396
                                                                 ... 
demonstration                                                       1
bloghttpsengineeringherokucom                                       1
httpsstoragegoogleapiscombertmodels20200220uncasedl2h768a12zip      1
jsdelivrhttpsgithubcomjsdelivrdatajsdelivrcom                       1
emceehttpsgithubcomdfmemcee                                         1
Length: 11513, dtype: int64
--------------------------
JavaScript:
react                          1031
native                          860
const                           498
icon                            442
use        

In [15]:
#Return one DF by combining all freq together: one row per word, one column
#per language (plus 'All'), holding that word's count as an integer.
#The 'C' column comes from c_freq.
word_counts = (pd.concat([all_lang_freq, 
                          python_freq, 
                          javascript_freq, 
                          java_freq, 
                          c_freq, 
                          typescript_freq,
                          go_freq], axis=1, sort=True)
                #set_axis returns a new frame by default; its `inplace`
                #argument was removed in pandas 2.0, so it is not passed
                .set_axis(['All', 
                           'Python', 
                           'JavaScript', 
                           'Java', 
                           'C', 
                           'TypeScript',
                           'Go'], axis=1)
                #a word absent from a language has no count there -> 0
                .fillna(0)
                .astype(int))

word_counts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32223 entries, &#9; to zyszyshttpsgithubcomzyszys
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   All         32223 non-null  int64
 1   Python      32223 non-null  int64
 2   JavaScript  32223 non-null  int64
 3   Java        32223 non-null  int64
 4   C           32223 non-null  int64
 5   TypeScript  32223 non-null  int64
 6   Go          32223 non-null  int64
dtypes: int64(7)
memory usage: 2.0+ MB
