In [1]:
import pandas as pd
from markdown import Markdown
from io import StringIO
from html.parser import HTMLParser
import re

In [2]:
def unmark_element(element, stream=None):
    """Serialize an ElementTree-style node to plain text, dropping all markup.

    Writes the node's leading text, then each child (recursively, in
    iteration order), then the node's tail text into ``stream``.

    Args:
        element: Node exposing ``text`` and ``tail`` attributes and yielding
            its children when iterated.
        stream: Text buffer shared across the recursion. A fresh ``StringIO``
            is created when it is omitted.

    Returns:
        Everything accumulated in the buffer so far, as a string.
    """
    out = StringIO() if stream is None else stream

    leading = element.text
    if leading:
        out.write(leading)

    for child in element:
        # Children append to the same buffer; their return value is not needed.
        unmark_element(child, out)

    trailing = element.tail
    if trailing:
        out.write(trailing)

    return out.getvalue()


# Patch python-markdown: register `unmark_element` on the Markdown class as the
# serializer for a custom "plain" output format. It is registered first so the
# format exists by the time the instance below asks for it.
Markdown.output_formats["plain"] = unmark_element
# Module-level converter shared by every call to `unmark`.
__md = Markdown(output_format="plain")
# NOTE(review): presumably disables the library's stripping of top-level
# wrapper tags, which the plain-text serializer never emits -- confirm against
# the installed python-markdown version.
__md.stripTopLevelTags = False


def unmark(text):
    """Convert Markdown source to plain text with the shared converter.

    Args:
        text: Markdown-formatted string.

    Returns:
        The string produced by the module-level ``__md`` converter for
        ``text``.
    """
    # A Markdown instance keeps per-document state between convert() calls
    # (e.g. reference-link definitions and stashed raw HTML). reset() clears
    # that state so one document cannot influence the next; it returns the
    # instance, which allows the call to be chained.
    return __md.reset().convert(text)


class MLStripper(HTMLParser):
    """HTML parser that discards tags and collects only the text between them."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset is
        # needed here. convert_charrefs=True makes the parser decode character
        # references (e.g. "&amp;") before the text reaches handle_data.
        super().__init__(convert_charrefs=True)
        # Kept so the attribute stays available to any code that reads it;
        # nothing in this class consults it.
        self.strict = False
        # Accumulates every text run reported by the parser.
        self.text = StringIO()

    def handle_data(self, d):
        """Append one run of text content to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Return ``html`` with every HTML tag removed.

    Args:
        html: String that may contain HTML markup.

    Returns:
        The text content only, with character references decoded.
    """
    s = MLStripper()
    s.feed(html)
    # feed() can hold back trailing input it considers incomplete, such as
    # text ending in "R&D" (a possibly split character reference) or an
    # unterminated "<...". close() forces that remainder through handle_data
    # so the end of the document is not silently dropped.
    s.close()
    return s.get_data()


def clean_text(text):
    """Reduce a Markdown/HTML document to whitespace-normalized ASCII text.

    Steps, in order: drop fenced code blocks, strip Markdown markup, strip
    HTML tags, discard non-ASCII characters, collapse whitespace runs.

    Args:
        text: Raw document string.

    Returns:
        The cleaned text. Every run of whitespace (including leading and
        trailing runs) is collapsed to a single space, not removed.
    """
    # delete codeblocks: DOTALL lets "." span newlines, and the non-greedy
    # match stops at the nearest closing fence. count/flags are passed by
    # keyword because positional use is deprecated in recent Python versions.
    text = re.sub(r"```.*?```", " ", text, flags=re.DOTALL)
    # delete markdown tags
    text = unmark(text)
    # delete html tags
    text = strip_tags(text)
    # delete non ascii char: unencodable characters are dropped, so the
    # remaining bytes are pure ASCII and decode losslessly.
    text = text.encode("ascii", "ignore").decode("ascii")
    # delete extra whitespaces: a single pass over r"\s+" already leaves no
    # run longer than one character, so no second pass is required.
    text = re.sub(r"\s+", " ", text)

    return text

In [3]:
# Load the dataset from 'GHDomains_cleaned.csv' (path is relative to the
# working directory) and display it.
df = pd.read_csv('GHDomains_cleaned.csv')
df


Unnamed: 0,Stars,Forks,Description,Readme,Languages,Licence,Releases,Status,Contents,Name,Domain,clean_readme,english_readme,clean_description,english_description
0,323943.0,25410.0,freeCodeCamp.org's open source codebase and cu...,![freeCodeCamp.org Social Banner](https://s3.a...,"['JavaScript', 'CSS', 'Shell', 'HTML', 'EJS', ...","BSD 3-Clause ""New"" or ""Revised"" License",0.0,True,"['.dockerignore', '.editorconfig', '.eslintign...",freeCodeCamp/freeCodeCamp,Documentation,freeCodeCamp.org's open-source codebase and cu...,1.000000,freeCodeCamp.org's open source codebase and cu...,1.0
1,150012.0,71296.0,"The most popular HTML, CSS, and JavaScript fra...","<p align=""center"">\n <a href=""https://getboot...","['JavaScript', 'HTML', 'SCSS', 'CSS', 'PowerSh...",MIT License,69.0,True,"['.babelrc.js', '.browserslistrc', '.bundlewat...",twbs/bootstrap,Web libraries and frameworks,"Bootstrap Sleek, intuitive, and powerful front...",0.979592,"The most popular HTML, CSS, and JavaScript fra...",1.0
2,96800.0,22341.0,"Bring data to life with SVG, Canvas and HTML. ...","# D3: Data-Driven Documents\n\n<a href=""https:...",['JavaScript'],"BSD 3-Clause ""New"" or ""Revised"" License",167.0,True,"['.gitignore', 'API.md', 'CHANGES.md', 'ISSUE_...",d3/d3,Web libraries and frameworks,D3: Data-Driven Documents D3 (or D3.js) is a J...,1.000000,"Bring data to life with SVG, Canvas and HTML. ...",1.0
3,168294.0,33061.0,"A declarative, efficient, and flexible JavaScr...",# [React](https://reactjs.org/) &middot; [![Gi...,"['JavaScript', 'HTML', 'CSS', 'C++', 'TypeScri...",MIT License,96.0,True,"['.circleci', '.codesandbox', '.editorconfig',...",facebook/react,Web libraries and frameworks,React React is a JavaScript library for build...,1.000000,"A declarative, efficient, and flexible JavaScr...",1.0
4,59608.0,28135.0,AngularJS - HTML enhanced for web apps!,AngularJS [![CircleCI](https://circleci.com/gh...,"['JavaScript', 'HTML', 'Shell', 'PHP', 'CSS']",MIT License,0.0,True,"['.circleci', '.editorconfig', '.eslintignore'...",angular/angular.js,Web libraries and frameworks,AngularJS AngularJS lets you write client-side...,1.000000,AngularJS - HTML enhanced for web apps!,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1850.0,349.0,This project aims to provide a working page fl...,# Changes:\n\n * Made clickable views like a ...,['Java'],,0.0,True,"['.gitignore', 'README.md', 'build.gradle', 'f...",Yalantis/FlipViewPager.Draco,Non-web libraries and frameworks,Changes: Made clickable views like a button cl...,0.937500,This project aims to provide a working page fl...,1.0
4996,1780.0,178.0,JavaScript Client-Side Cookie Manipulation Lib...,# Cookies.js\n\nCookies.js is a small client-s...,"['JavaScript', 'CSS', 'HTML']",The Unlicense,10.0,True,"['.gitignore', 'CHANGELOG.md', 'README.md', 'U...",ScottHamper/Cookies,Web libraries and frameworks,Cookies.js Cookies.js is a small client-side j...,1.000000,JavaScript Client-Side Cookie Manipulation Lib...,1.0
4997,3138.0,300.0,The earliest versions of the very first c comp...,legacy-cc\n=========\n\nThe earliest versions ...,"['C', 'Assembly']",,0.0,True,"['Caldera-license.pdf', 'README.md', 'last1120...",mortdeus/legacy-cc,Software tools,legacy-cc The earliest versions of the very fi...,1.000000,The earliest versions of the very first c comp...,1.0
4998,1668.0,441.0,A clone of the UIImagePickerController using t...,# ELCImagePickerController\n\n*A clone of the ...,"['Objective-C', 'Ruby']",,1.0,True,"['.gitignore', 'Classes', 'Default-568h@2x.png...",B-Sides/ELCImagePickerController,Non-web libraries and frameworks,ELCImagePickerController A clone of the UIImag...,1.000000,A clone of the UIImagePickerController using t...,1.0


In [4]:
# Recompute the cleaned README text. Missing values are replaced with an empty
# string first: astype('str') alone would turn NaN into the literal text "nan",
# which would then be cleaned and stored as if it were README content.
df['clean_readme'] = df['Readme'].fillna('').astype('str').apply(clean_text)

In [5]:
# Display the frame after the 'clean_readme' column was assigned.
df

Unnamed: 0,Stars,Forks,Description,Readme,Languages,Licence,Releases,Status,Contents,Name,Domain,clean_readme,english_readme,clean_description,english_description
0,323943.0,25410.0,freeCodeCamp.org's open source codebase and cu...,![freeCodeCamp.org Social Banner](https://s3.a...,"['JavaScript', 'CSS', 'Shell', 'HTML', 'EJS', ...","BSD 3-Clause ""New"" or ""Revised"" License",0.0,True,"['.dockerignore', '.editorconfig', '.eslintign...",freeCodeCamp/freeCodeCamp,Documentation,freeCodeCamp.org's open-source codebase and cu...,1.000000,freeCodeCamp.org's open source codebase and cu...,1.0
1,150012.0,71296.0,"The most popular HTML, CSS, and JavaScript fra...","<p align=""center"">\n <a href=""https://getboot...","['JavaScript', 'HTML', 'SCSS', 'CSS', 'PowerSh...",MIT License,69.0,True,"['.babelrc.js', '.browserslistrc', '.bundlewat...",twbs/bootstrap,Web libraries and frameworks,"Bootstrap Sleek, intuitive, and powerful fron...",0.979592,"The most popular HTML, CSS, and JavaScript fra...",1.0
2,96800.0,22341.0,"Bring data to life with SVG, Canvas and HTML. ...","# D3: Data-Driven Documents\n\n<a href=""https:...",['JavaScript'],"BSD 3-Clause ""New"" or ""Revised"" License",167.0,True,"['.gitignore', 'API.md', 'CHANGES.md', 'ISSUE_...",d3/d3,Web libraries and frameworks,D3: Data-Driven Documents D3 (or D3.js) is a J...,1.000000,"Bring data to life with SVG, Canvas and HTML. ...",1.0
3,168294.0,33061.0,"A declarative, efficient, and flexible JavaScr...",# [React](https://reactjs.org/) &middot; [![Gi...,"['JavaScript', 'HTML', 'CSS', 'C++', 'TypeScri...",MIT License,96.0,True,"['.circleci', '.codesandbox', '.editorconfig',...",facebook/react,Web libraries and frameworks,React React is a JavaScript library for buildi...,1.000000,"A declarative, efficient, and flexible JavaScr...",1.0
4,59608.0,28135.0,AngularJS - HTML enhanced for web apps!,AngularJS [![CircleCI](https://circleci.com/gh...,"['JavaScript', 'HTML', 'Shell', 'PHP', 'CSS']",MIT License,0.0,True,"['.circleci', '.editorconfig', '.eslintignore'...",angular/angular.js,Web libraries and frameworks,AngularJS AngularJS lets you write client-side...,1.000000,AngularJS - HTML enhanced for web apps!,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1850.0,349.0,This project aims to provide a working page fl...,# Changes:\n\n * Made clickable views like a ...,['Java'],,0.0,True,"['.gitignore', 'README.md', 'build.gradle', 'f...",Yalantis/FlipViewPager.Draco,Non-web libraries and frameworks,Changes: Made clickable views like a button cl...,0.937500,This project aims to provide a working page fl...,1.0
4996,1780.0,178.0,JavaScript Client-Side Cookie Manipulation Lib...,# Cookies.js\n\nCookies.js is a small client-s...,"['JavaScript', 'CSS', 'HTML']",The Unlicense,10.0,True,"['.gitignore', 'CHANGELOG.md', 'README.md', 'U...",ScottHamper/Cookies,Web libraries and frameworks,Cookies.js Cookies.js is a small client-side j...,1.000000,JavaScript Client-Side Cookie Manipulation Lib...,1.0
4997,3138.0,300.0,The earliest versions of the very first c comp...,legacy-cc\n=========\n\nThe earliest versions ...,"['C', 'Assembly']",,0.0,True,"['Caldera-license.pdf', 'README.md', 'last1120...",mortdeus/legacy-cc,Software tools,legacy-cc The earliest versions of the very fi...,1.000000,The earliest versions of the very first c comp...,1.0
4998,1668.0,441.0,A clone of the UIImagePickerController using t...,# ELCImagePickerController\n\n*A clone of the ...,"['Objective-C', 'Ruby']",,1.0,True,"['.gitignore', 'Classes', 'Default-568h@2x.png...",B-Sides/ELCImagePickerController,Non-web libraries and frameworks,ELCImagePickerController A clone of the UIImag...,1.000000,A clone of the UIImagePickerController using t...,1.0


In [6]:
# Recompute the cleaned description text. Missing values are replaced with an
# empty string first: astype('str') alone would turn NaN into the literal text
# "nan", which would then be cleaned and stored as if it were a description.
df['clean_description'] = df['Description'].fillna('').astype('str').apply(clean_text)
df

Unnamed: 0,Stars,Forks,Description,Readme,Languages,Licence,Releases,Status,Contents,Name,Domain,clean_readme,english_readme,clean_description,english_description
0,323943.0,25410.0,freeCodeCamp.org's open source codebase and cu...,![freeCodeCamp.org Social Banner](https://s3.a...,"['JavaScript', 'CSS', 'Shell', 'HTML', 'EJS', ...","BSD 3-Clause ""New"" or ""Revised"" License",0.0,True,"['.dockerignore', '.editorconfig', '.eslintign...",freeCodeCamp/freeCodeCamp,Documentation,freeCodeCamp.org's open-source codebase and cu...,1.000000,freeCodeCamp.org's open source codebase and cu...,1.0
1,150012.0,71296.0,"The most popular HTML, CSS, and JavaScript fra...","<p align=""center"">\n <a href=""https://getboot...","['JavaScript', 'HTML', 'SCSS', 'CSS', 'PowerSh...",MIT License,69.0,True,"['.babelrc.js', '.browserslistrc', '.bundlewat...",twbs/bootstrap,Web libraries and frameworks,"Bootstrap Sleek, intuitive, and powerful fron...",0.979592,"The most popular HTML, CSS, and JavaScript fra...",1.0
2,96800.0,22341.0,"Bring data to life with SVG, Canvas and HTML. ...","# D3: Data-Driven Documents\n\n<a href=""https:...",['JavaScript'],"BSD 3-Clause ""New"" or ""Revised"" License",167.0,True,"['.gitignore', 'API.md', 'CHANGES.md', 'ISSUE_...",d3/d3,Web libraries and frameworks,D3: Data-Driven Documents D3 (or D3.js) is a J...,1.000000,"Bring data to life with SVG, Canvas and HTML. ...",1.0
3,168294.0,33061.0,"A declarative, efficient, and flexible JavaScr...",# [React](https://reactjs.org/) &middot; [![Gi...,"['JavaScript', 'HTML', 'CSS', 'C++', 'TypeScri...",MIT License,96.0,True,"['.circleci', '.codesandbox', '.editorconfig',...",facebook/react,Web libraries and frameworks,React React is a JavaScript library for buildi...,1.000000,"A declarative, efficient, and flexible JavaScr...",1.0
4,59608.0,28135.0,AngularJS - HTML enhanced for web apps!,AngularJS [![CircleCI](https://circleci.com/gh...,"['JavaScript', 'HTML', 'Shell', 'PHP', 'CSS']",MIT License,0.0,True,"['.circleci', '.editorconfig', '.eslintignore'...",angular/angular.js,Web libraries and frameworks,AngularJS AngularJS lets you write client-side...,1.000000,AngularJS - HTML enhanced for web apps!,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1850.0,349.0,This project aims to provide a working page fl...,# Changes:\n\n * Made clickable views like a ...,['Java'],,0.0,True,"['.gitignore', 'README.md', 'build.gradle', 'f...",Yalantis/FlipViewPager.Draco,Non-web libraries and frameworks,Changes: Made clickable views like a button cl...,0.937500,This project aims to provide a working page fl...,1.0
4996,1780.0,178.0,JavaScript Client-Side Cookie Manipulation Lib...,# Cookies.js\n\nCookies.js is a small client-s...,"['JavaScript', 'CSS', 'HTML']",The Unlicense,10.0,True,"['.gitignore', 'CHANGELOG.md', 'README.md', 'U...",ScottHamper/Cookies,Web libraries and frameworks,Cookies.js Cookies.js is a small client-side j...,1.000000,JavaScript Client-Side Cookie Manipulation Lib...,1.0
4997,3138.0,300.0,The earliest versions of the very first c comp...,legacy-cc\n=========\n\nThe earliest versions ...,"['C', 'Assembly']",,0.0,True,"['Caldera-license.pdf', 'README.md', 'last1120...",mortdeus/legacy-cc,Software tools,legacy-cc The earliest versions of the very fi...,1.000000,The earliest versions of the very first c comp...,1.0
4998,1668.0,441.0,A clone of the UIImagePickerController using t...,# ELCImagePickerController\n\n*A clone of the ...,"['Objective-C', 'Ruby']",,1.0,True,"['.gitignore', 'Classes', 'Default-568h@2x.png...",B-Sides/ELCImagePickerController,Non-web libraries and frameworks,ELCImagePickerController A clone of the UIImag...,1.000000,A clone of the UIImagePickerController using t...,1.0


In [7]:
# Write the frame to 'GHDomains.csv' (relative to the working directory);
# index=False leaves the DataFrame index out of the file.
df.to_csv('GHDomains.csv', index=False)