In [1]:
import nltk
import fasttext
from markdown import markdown
import re
from bs4 import BeautifulSoup
import pandas as pd


class LanguageIdentification:

    def __init__(self):
        pretrained_lang_model = 'lid.176.bin'
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text):
        predictions = self.model.predict(text, k=1) # returns top matching language
        return predictions

lang_detector = LanguageIdentification()


def removeHTML(html_markdown_string):
    """ Converts a markdown string to plaintext """
    # first, remove code blocks:
    mkd = re.sub(r"```.*?```",'', html_markdown_string)

    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(mkd)

    # remove code snippets
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html)
    html = re.sub(r'<code>(.*?)</code >', ' ', html)

    # extract text
    soup = BeautifulSoup(html, "lxml")
    text = ''.join(soup.findAll(text=True))

    # remove urls, matched empty parethesis, remaining tags and whitespaces
    text = re.sub(r"\s*(?:https?://)?www\.\S*\.[A-Za-z]{2,5}\s*", "", text)
    text = re.sub(r"\( {0,2}\)","",text)
    text = re.sub(r'[\r|\n|\r\n]+', ' ', text)
    text = re.sub('\s{1,}', " ", text)


    return text


def clean_text_column(text_column: pd.Series):
    text_column = text_column.values.tolist()
    # (1) Remove code blocks, markdown / html tags etc..
    cleaned_html = [removeHTML(text) for text in text_column]
    # (2) Tokenize text in sentences and detect language.
    tokenized_sentences = [nltk.sent_tokenize(text) for text in cleaned_html]
    sentence_language = [[lang_detector.predict_lang(sentence)[0][0] for sentence in text] for text in tokenized_sentences]
    # (3) Count proportion of english text.
    sentences_len = [len(text) for text in tokenized_sentences]
    english_sentences = [sum([sentence.count('__label__en') for sentence in text]) for text in sentence_language]
    english_language = [eng_lang/count if count != 0 else 0 for eng_lang, count in zip(english_sentences, sentences_len)]
    # (4) Join sentences and remove non ascii characters.
    clean_sentences = [' '.join(sentences) for sentences in tokenized_sentences]
    clean_sentences = ([' '.join(sentences.split()) for sentences in clean_sentences])
    clean_sentences = [sentence.encode('ascii', 'ignore').decode('utf-8', 'ignore') for sentence in clean_sentences]
    # (5) Return clean text and proportion of english language.
    return clean_sentences, english_language




In [2]:
columns = ['Stars','Forks','Releases','Description','Readme','Languages','Licence','Name','Domain','Contents', 'Status']
df = pd.read_csv('expanded_frame_final.csv', usecols=columns)
df

Unnamed: 0,Stars,Forks,Description,Readme,Languages,Licence,Releases,Status,Contents,Name,Domain
0,323943.0,25410.0,freeCodeCamp.org's open source codebase and cu...,![freeCodeCamp.org Social Banner](https://s3.a...,"['JavaScript', 'CSS', 'Shell', 'HTML', 'EJS', ...","BSD 3-Clause ""New"" or ""Revised"" License",0.0,True,"['.dockerignore', '.editorconfig', '.eslintign...",freeCodeCamp/freeCodeCamp,Documentation
1,150012.0,71296.0,"The most popular HTML, CSS, and JavaScript fra...","<p align=""center"">\n <a href=""https://getboot...","['JavaScript', 'HTML', 'SCSS', 'CSS', 'PowerSh...",MIT License,69.0,True,"['.babelrc.js', '.browserslistrc', '.bundlewat...",twbs/bootstrap,Web libraries and frameworks
2,96800.0,22341.0,"Bring data to life with SVG, Canvas and HTML. ...","# D3: Data-Driven Documents\n\n<a href=""https:...",['JavaScript'],"BSD 3-Clause ""New"" or ""Revised"" License",167.0,True,"['.gitignore', 'API.md', 'CHANGES.md', 'ISSUE_...",d3/d3,Web libraries and frameworks
3,168294.0,33061.0,"A declarative, efficient, and flexible JavaScr...",# [React](https://reactjs.org/) &middot; [![Gi...,"['JavaScript', 'HTML', 'CSS', 'C++', 'TypeScri...",MIT License,96.0,True,"['.circleci', '.codesandbox', '.editorconfig',...",facebook/react,Web libraries and frameworks
4,59608.0,28135.0,AngularJS - HTML enhanced for web apps!,AngularJS [![CircleCI](https://circleci.com/gh...,"['JavaScript', 'HTML', 'Shell', 'PHP', 'CSS']",MIT License,0.0,True,"['.circleci', '.editorconfig', '.eslintignore'...",angular/angular.js,Web libraries and frameworks
...,...,...,...,...,...,...,...,...,...,...,...
4995,1850.0,349.0,This project aims to provide a working page fl...,# Changes:\n\n * Made clickable views like a ...,['Java'],,0.0,True,"['.gitignore', 'README.md', 'build.gradle', 'f...",Yalantis/FlipViewPager.Draco,Non-web libraries and frameworks
4996,1780.0,178.0,JavaScript Client-Side Cookie Manipulation Lib...,# Cookies.js\n\nCookies.js is a small client-s...,"['JavaScript', 'CSS', 'HTML']",The Unlicense,10.0,True,"['.gitignore', 'CHANGELOG.md', 'README.md', 'U...",ScottHamper/Cookies,Web libraries and frameworks
4997,3138.0,300.0,The earliest versions of the very first c comp...,legacy-cc\n=========\n\nThe earliest versions ...,"['C', 'Assembly']",,0.0,True,"['Caldera-license.pdf', 'README.md', 'last1120...",mortdeus/legacy-cc,Software tools
4998,1668.0,441.0,A clone of the UIImagePickerController using t...,# ELCImagePickerController\n\n*A clone of the ...,"['Objective-C', 'Ruby']",,1.0,True,"['.gitignore', 'Classes', 'Default-568h@2x.png...",B-Sides/ELCImagePickerController,Non-web libraries and frameworks


In [3]:
clean_sentences, english_language = clean_text_column(df['Readme'].astype(str))
df = pd.concat([df,pd.DataFrame({'clean_readme':clean_sentences, 'english_readme':english_language})],axis=1)

In [4]:
clean_sentences, english_language = clean_text_column(df['Description'].astype(str))
df = pd.concat([df,pd.DataFrame({'clean_description':clean_sentences, 'english_description':english_language})],axis=1)

In [5]:
df.to_csv('GHDomains_cleaned.csv', index=False)

1. Drop repositories which Status == False, meaning they cant't be reached anymore.

In [6]:
len(df[df['Status']==False])

36

2. Drop repositories which Status == True and english language is less than 0.5. Threshold fixed after manual inspection. 

In [7]:
len(df[(df['english_readme'] < 0.5) & (df['Status'] == True)])

168

3. We should drop remaining repositories without enough info, such as README, Description or which content as been replaced by messages such as 'Deprecated, No longer maintained...'