# PLAN

- [x] Acquisition
    - [x] Select which list of repos to scrape.
    - [x] Request the pages from the site.
    - [x] Save responses to a JSON file.
- [x] Preparation
    - [x] Prepare the data for analysis.
- [ ] Exploration
    - [ ] Answer the following prompts:
        - [ ] What are the most common words in READMEs?
        - [ ] What does the distribution of IDFs look like for the most common words?
        - [ ] Does the length of the README vary by language?
        - [ ] Do different languages use a different number of unique words?
- [ ] Modeling
    - [ ] Transform the data for machine learning; use language to predict.
    - [ ] Fit several models using different text representations.
    - [ ] Build a function that takes in the text of a README file and predicts its language.
- [ ] Delivery
    - [ ] Github repo
        - [x] This notebook.
        - [ ] Documentation within the notebook.
        - [ ] README file in the repo.
        - [ ] Python scripts if applicable.
    - [ ] Google Slides
        - [ ] 1-2 slides only summarizing analysis.
        - [ ] Visualizations are labeled.
        - [ ] Geared for the general audience.
        - [ ] Share the link in the README file and/or the classroom.

# ENVIRONMENT

In [1]:
import os
import sys

import pandas as pd
import re
import json
import unicodedata
import nltk
import spacy

from requests import get
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# Extra tokens that are filtered out together with NLTK's English stopword
# list (used by remove_stopwords and clean below).
# NOTE(review): '-PRON-' looks like the placeholder lemma spaCy emits for
# pronouns -- confirm against the installed spaCy version.
# NOTE(review): '\n\n\n' can never equal a token produced by str.split(),
# which is how both consumers tokenize -- presumably a leftover; verify.
ADDITIONAL_STOPWORDS = ['readme', '\n\n\n', '-PRON-']

# ACQUIRE

In [2]:
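# Initial plan: search GitHub for "san antonio data" and scrape the results.
# What the cells below actually scrape is an organization's repository listing:
# https://github.com/open-austin is used in the commented-out test calls, and
# https://github.com/texastribune in the full run.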

In [3]:
def get_github_repo(url):
    """
    Fetch one repository page and extract its README text and language.

    The page at `url` is downloaded and parsed as HTML. The returned
    dictionary has two keys:

    - 'readme':   text of the <div id="readme"> element, or the sentence
                  'No readme file.' when the page has no such element.
    - 'language': text of the first <span class="lang"> element, or the
                  sentence 'No language specified.' when there is none.
    """
    page = BeautifulSoup(get(url).text, 'html.parser')
    readme_node = page.find('div', id='readme')
    language_node = page.find('span', class_='lang')

    return {
        'readme': 'No readme file.' if readme_node is None else readme_node.text,
        'language': ('No language specified.' if language_node is None
                     else language_node.text),
    }

In [4]:
# # This line to test out the function.
# get_github_repo('https://github.com/open-austin/atx-citysdk-js')

In [5]:
def get_github_links(url):
    """
    Collect the repository links found on a single listing page.

    The page at `url` is downloaded and parsed, and the `href` of every
    <a itemprop="name codeRepository"> anchor whose href starts with "/"
    is returned, in document order, as a list of strings.
    """
    listing = BeautifulSoup(get(url).text, 'html.parser')
    repo_anchors = listing.findAll(
        'a',
        itemprop='name codeRepository',
        attrs={'href': re.compile("^/")},
    )
    return [anchor.get('href') for anchor in repo_anchors]

In [6]:
# # This line to test out the function.
# get_github_links('https://github.com/open-austin?page=3')

In [7]:
def get_all_github_links(path, num_pages):
    """
    Gather the repository links from the first `num_pages` listing pages.

    Pages are numbered starting at 1 and each one is requested as
    `path + '?page=<n>'` through `get_github_links`.

    Parameters
    ----------
    path : str
        Base URL of the listing, without any query string.
    num_pages : int
        How many pages to visit; 0 (or less) visits none.

    Returns
    -------
    list of list of str
        One inner list of links per visited page, in page order.
    """
    # Each page is downloaded exactly once, inside get_github_links. No
    # request is made to `path + str(page)`: that URL has no '?page='
    # separator, and nothing ever read the response.
    return [get_github_links(path + '?page=' + str(page))
            for page in range(1, num_pages + 1)]

In [8]:
# # This line to test out the function.
# get_all_github_links('https://github.com/open-austin', 3)

In [9]:
def traverse(o, tree_types=(list, tuple)):
    """
    Lazily flatten an arbitrarily nested structure.

    Any value that is an instance of `tree_types` is descended into; every
    other value is yielded as a leaf, depth-first and left to right.
    """
    if not isinstance(o, tree_types):
        yield o
        return
    for child in o:
        yield from traverse(child, tree_types)

In [10]:
def get_github_readme(url, num_pages, cache=True):
    """
    Return the scraped README/language records, using a JSON file as cache.

    When `cache` is true and 'github_readme.json' exists in the working
    directory, its content is returned and nothing is downloaded.
    Otherwise the first `num_pages` listing pages under `url` are scraped,
    every linked repository is fetched with `get_github_repo`, and the
    result is written to 'github_readme.json' before being returned.

    Parameters
    ----------
    url : str
        Base URL of the listing to scrape.
    num_pages : int
        Number of listing pages to visit.
    cache : bool
        Whether an existing 'github_readme.json' may be used.

    Returns
    -------
    list of dict
        One dictionary per repository, as produced by `get_github_repo`
        (or whatever was stored in the cache file).
    """
    cache_file = 'github_readme.json'

    if cache and os.path.exists(cache_file):
        # `with` closes the handle; a bare open() left that to the GC.
        with open(cache_file) as cached:
            return json.load(cached)

    readme_text = []
    for value in traverse(get_all_github_links(url, num_pages)):
        repo_url = 'https://github.com' + value
        print(repo_url)  # progress output while scraping
        readme_text.append(get_github_repo(repo_url))

    # Closing the file explicitly guarantees the JSON is flushed to disk.
    with open(cache_file, 'w') as out:
        json.dump(readme_text, out)
    return readme_text

In [11]:
# Bringing it all together chaining...
# Scrape (or, if github_readme.json already exists, reload) the READMEs
# linked from 8 listing pages under the texastribune organization, then
# display the resulting list of {'readme', 'language'} dictionaries.
corpus = get_github_readme('https://github.com/texastribune', 8, cache=True)
corpus

[{'readme': "\n\n\n\n        README.md\n      \n\n\nBase images\nHow to make updates:\n\nCreate a new branch\nIf you're adding a python dependency:\n\nRun make run-base\nRun poetry add --dev <package> (drop the --dev if it's a production\ndependency)\nFor other operations see the\npoetry docs\nMaybe edit pyproject.toml by hand if necessary\nRun poetry lock\n\n\nIf it's a node dependency:\n\nRun make run-dev\nDo whatever node/yarn things you people do ;-)\n\n\nBump the version in VERSION file\nBump version in dev/Dockerfile\nCommit your changes\nOpen a pull request; if necessary\nCommit and tag it make tag\nMerge it to master\nDelete the branch\nPush\nUpdate child projects to use this new version\n\n\n\n",
  'language': 'Dockerfile'},
 {'readme': "\n\n\n\n        README.md\n      \n\n\nSoftware to collect donations for nonprofits. It integrates with Saleforce, Stripe, Amazon Pay, Slack and Sentry.\nDonations\n\nPython running Flask\nsupports single and recurring donations\neasily deploy

# PREPARE

In [25]:
def basic_clean(original):
    """
    Normalize raw text ahead of tokenizing.

    The text is lowercased, decomposed with NFKD and reduced to ASCII
    (characters with no ASCII form are dropped), stripped of everything
    that is not a lowercase letter, digit, apostrophe or whitespace, and
    finally has its newlines and tabs turned into single spaces.
    """
    lowered = original.lower()
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')
    kept = re.sub(r"[^a-z0-9'\s]", '', ascii_text)
    # One pass that maps both '\n' and '\t' to a space.
    return kept.translate(str.maketrans('\n\t', '  '))

def tokenize(original):
    """
    Clean `original` with basic_clean and split it into tokens using
    NLTK's ToktokTokenizer; returns whatever that tokenizer returns.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    cleaned = basic_clean(original)
    return toktok.tokenize(cleaned)

def stem(original):
    """
    Apply NLTK's Porter stemmer to every whitespace-separated word of
    `original` and return the stems joined by single spaces.
    """
    porter = nltk.porter.PorterStemmer()
    return ' '.join(map(porter.stem, original.split()))

# spaCy pipeline shared by every lemmatize() call; loaded on first use.
_SPACY_MODEL = None


def lemmatize(original):
    """
    Replace every token of `original` with its spaCy lemma.

    The text is run through the spaCy 'en' pipeline and the `lemma_` of
    each token is joined with single spaces.

    The pipeline is loaded once and reused: loading it inside the function
    body on every call repeated the same expensive work for each README.

    NOTE(review): the 'en' shortcut and the parse/tag/entity keyword
    arguments depend on the installed spaCy version -- confirm before
    upgrading spaCy.
    """
    global _SPACY_MODEL
    if _SPACY_MODEL is None:
        _SPACY_MODEL = spacy.load('en', parse=True, tag=True, entity=True)

    doc = _SPACY_MODEL(original)  # process the text with spacy
    return ' '.join(token.lemma_ for token in doc)

def remove_stopwords(original, extra_words=('readmemd',), exclude_words=()):
    """
    Drop stopwords from a whitespace-separated string.

    The stopword set is NLTK's English list plus ADDITIONAL_STOPWORDS plus
    `extra_words`, minus `exclude_words`. Matching is exact and
    case-sensitive on the tokens produced by `original.split()`.

    Parameters
    ----------
    original : str
        Text to filter.
    extra_words : iterable of str
        Additional words to treat as stopwords.
    exclude_words : iterable of str
        Words that must NOT be treated as stopwords. Words that are not in
        the stopword set are ignored rather than raising.

    Returns
    -------
    str
        The remaining words joined by single spaces.

    Side effects
    ------------
    Prints how many words were removed, followed by a '---' separator.
    """
    # A set gives O(1) membership tests and makes an excluded word vanish
    # completely, even when it was contributed by more than one source.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(ADDITIONAL_STOPWORDS)
    stopword_set.update(extra_words)
    stopword_set.difference_update(exclude_words)

    words = original.split()
    filtered_words = [w for w in words if w not in stopword_set]

    print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
    print('---')

    return ' '.join(filtered_words)

def prep_article(article):
    """
    Add the processed variants of an article's README to the article.

    `article['readme']` is cleaned and stemmed, the stemmed text is
    lemmatized, and the lemmatized text has its stopwords removed. The
    three results are stored under 'stemmed', 'lemmatized' and 'clean'.
    The same (mutated) `article` object is returned.
    """
    stemmed = stem(basic_clean(article['readme']))
    lemmatized = lemmatize(stemmed)
    without_stopwords = remove_stopwords(lemmatized)

    # Write only after every step succeeded, in a fixed key order.
    for key, value in (('stemmed', stemmed),
                       ('lemmatized', lemmatized),
                       ('clean', without_stopwords)):
        article[key] = value

    return article

def prepare_article_data(corpus):
    """
    Run prep_article over every article in `corpus` and return the
    results as a new list, in the same order.
    """
    return [prep_article(article) for article in corpus]

# This is to fix the string as list of words per readme file glitch
def clean(text):
    """
    Turn a block of text into a list of lemmatized, non-stopword words.

    The text is NFKD-normalized, reduced to ASCII and lowercased; every
    character that is neither a word character nor whitespace is removed;
    the result is split on whitespace. Words found in NLTK's English
    stopword list or in ADDITIONAL_STOPWORDS are dropped and the rest are
    passed through NLTK's WordNetLemmatizer.

    Returns
    -------
    list of str
        The surviving words, lemmatized, in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # Named `stop_words` so the imported `stopwords` name is not shadowed;
    # a frozenset keeps the per-word membership test O(1).
    stop_words = frozenset(
        nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    )
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stop_words]

In [26]:
# Run the preparation pipeline over every scraped article and load the
# resulting list of dictionaries into a DataFrame (one row per article).
# remove_stopwords prints a "Removed N stopwords" line per article.
df = pd.DataFrame(prepare_article_data(corpus))
# Display the (rows, columns) dimensions of the frame.
df.shape

Removed 33 stopwords
---
Removed 165 stopwords
---
Removed 908 stopwords
---
Removed 227 stopwords
---
Removed 64 stopwords
---
Removed 16 stopwords
---
Removed 11 stopwords
---
Removed 1 stopwords
---
Removed 1 stopwords
---
Removed 1 stopwords
---
Removed 21 stopwords
---
Removed 88 stopwords
---
Removed 245 stopwords
---
Removed 1164 stopwords
---
Removed 8 stopwords
---
Removed 102 stopwords
---
Removed 497 stopwords
---
Removed 1 stopwords
---
Removed 1 stopwords
---
Removed 37 stopwords
---
Removed 475 stopwords
---
Removed 290 stopwords
---
Removed 311 stopwords
---
Removed 1 stopwords
---
Removed 32 stopwords
---
Removed 16 stopwords
---
Removed 21 stopwords
---
Removed 8 stopwords
---
Removed 26 stopwords
---
Removed 44 stopwords
---
Removed 159 stopwords
---
Removed 119 stopwords
---
Removed 47 stopwords
---
Removed 109 stopwords
---
Removed 132 stopwords
---
Removed 4 stopwords
---
Removed 157 stopwords
---
Removed 225 stopwords
---
Removed 115 stopwords
---
Removed 36 stopw

(211, 5)

In [27]:
# Keep only the fully processed text and the language label.
df = df[['clean', 'language']]
# remove_stopwords(df.iloc[11].clean) - ZACH'S DIAGNOSTIC TEST

In [28]:
# Absolute count (normalize=False) and relative share (normalize=True) of
# every language label, side by side.
languages = pd.concat(
    [df.language.value_counts(normalize=flag) for flag in (False, True)],
    axis=1,
)
languages.columns = ['n', 'ratio']
languages

Unnamed: 0,n,ratio
Python,68,0.322275
JavaScript,60,0.28436
No language specified.,20,0.094787
CSS,20,0.094787
HTML,14,0.066351
Shell,13,0.061611
Makefile,5,0.023697
Dockerfile,5,0.023697
Ruby,3,0.014218
Jupyter Notebook,2,0.009479


In [29]:
# Drop every row labelled 'No language specified.', convert the index
# labels to strings and rename the 'clean' column to 'text'.
df = (
    df[df.language != 'No language specified.']
    .rename(index=str, columns={"clean": "text"})
)

In [30]:
# Dimensions of the frame once the rows without a language have been dropped.
df.shape

(191, 2)

In [31]:
# Recompute the per-language count (normalize=False) and share
# (normalize=True) on the current contents of df.
languages = pd.concat(
    [df.language.value_counts(normalize=flag) for flag in (False, True)],
    axis=1,
)
languages.columns = ['n', 'ratio']
languages

Unnamed: 0,n,ratio
Python,68,0.356021
JavaScript,60,0.314136
CSS,20,0.104712
HTML,14,0.073298
Shell,13,0.068063
Makefile,5,0.026178
Dockerfile,5,0.026178
Ruby,3,0.015707
Jupyter Notebook,2,0.010471
CoffeeScript,1,0.005236


# EXPLORE

In [32]:
def _language_words(language):
    """Return the cleaned word list of every README labelled `language`."""
    return clean(' '.join(df[df.language == language].text))


python_words = _language_words('Python')

js_words = _language_words('JavaScript')
css_words = _language_words('CSS')
html_words = _language_words('HTML')
shell_words = _language_words('Shell')
# The label stored in the data is 'Dockerfile'; filtering on 'Docker'
# matched no rows and always produced an empty word list.
docker_words = _language_words('Dockerfile')
maker_words = _language_words('Makefile')
ruby_words = _language_words('Ruby')
jupyter_words = _language_words('Jupyter Notebook')
coffee_words = _language_words('CoffeeScript')

# Words of every README, regardless of language.
all_words = clean(' '.join(df.text))

In [33]:
# Display the cleaned word list built from the Python READMEs.
python_words

['softwar',
 'collect',
 'donat',
 'nonprofit',
 'integr',
 'saleforc',
 'stripe',
 'amazon',
 'pay',
 'slack',
 'sentri',
 'donat',
 'python',
 'run',
 'flask',
 'support',
 'singl',
 'recur',
 'donat',
 'easili',
 'deploy',
 'heroku',
 'get',
 'start',
 'recommend',
 'method',
 'run',
 'thi',
 'repo',
 'local',
 'use',
 'docker',
 'alreadi',
 'docker',
 'set',
 'want',
 'instal',
 'docker',
 'mac',
 'get',
 'docker',
 'environ',
 'set',
 'comput',
 'also',
 'need',
 'env',
 'file',
 'set',
 'environ',
 'variabl',
 'stripe',
 'salesforc',
 'docker',
 'find',
 'default',
 'makefil',
 'look',
 'envdock',
 'thi',
 'override',
 'dockerenvfil',
 'environ',
 'variabl',
 'also',
 'instal',
 'precommit',
 'use',
 'manag',
 'git',
 'hook',
 'includ',
 'j',
 'format',
 'via',
 'pretty',
 'onc',
 'download',
 'run',
 'precommit',
 'instal',
 'root',
 'thi',
 'repo',
 'also',
 'need',
 'node',
 'version',
 '8',
 'requir',
 'python',
 '36',
 'see',
 'requirementstxt',
 'devrequirementstxt',
 'spec

In [34]:
# One frequency table per word list: each list becomes a Series and its
# distinct values are counted.
(python_freq, js_freq, css_freq, html_freq, shell_freq,
 docker_freq, maker_freq, ruby_freq, jupyter_freq, coffee_freq,
 all_freq) = (
    pd.Series(word_list).value_counts()
    for word_list in (python_words, js_words, css_words, html_words,
                      shell_words, docker_words, maker_words, ruby_words,
                      jupyter_words, coffee_words, all_words)
)

# Display the first five rows of the Python frequency table.
python_freq.head()

thi       242
use       178
instal    134
django     98
set        89
dtype: int64

# MODEL