In [2]:
# imports for required libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

from sqlalchemy import create_engine

In [3]:
# Load the dataset into a pandas DataFrame.
DATASET_CSV = 'TopStaredRepositories.csv'
df = pd.read_csv(DATASET_CSV)

In [4]:
# Preview the first five rows of the dataset (five is the default for head()).
df.head()

Unnamed: 0,Username,Repository Name,Description,Last Update Date,Language,Number of Stars,Tags,Url,Gravatar
0,freeCodeCamp,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,2017-06-24T15:56:17Z,JavaScript,290k,"nonprofits,certification,curriculum,react,node...",https://github.com/freeCodeCamp/freeCodeCamp,https://avatars0.githubusercontent.com/u/98925...
1,twbs,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",2017-06-24T15:40:21Z,JavaScript,112k,"javascript,css,html,bootstrap,jekyll-site,scss",https://github.com/twbs/bootstrap,https://avatars0.githubusercontent.com/u/29185...
2,EbookFoundation,free-programming-books,Freely available programming books,2017-06-23T01:09:34Z,,87.8k,"education,list,books,resource",https://github.com/EbookFoundation/free-progra...,https://avatars0.githubusercontent.com/u/14127...
3,facebook,react,"A declarative, efficient, and flexible JavaScr...",2017-06-24T19:33:49Z,JavaScript,69.7k,,https://github.com/facebook/react,https://avatars3.githubusercontent.com/u/69631...
4,d3,d3,"Bring data to life with SVG, Canvas and HTML.",2017-05-31T06:03:47Z,JavaScript,65.7k,visualization,https://github.com/d3/d3,https://avatars1.githubusercontent.com/u/15627...


In [5]:
# Show (number of rows, number of columns) of the dataset: 980 and 9 respectively.
(len(df.index), len(df.columns))

(980, 9)

In [6]:
# Columns of the dataset that are imported into the db file.
columns = [
    'Repository Name',
    'Description',
    'Language',
    'Url',
    'Tags',
]

# Preview only the selected columns.
df.loc[:, columns].head()

Unnamed: 0,Repository Name,Description,Language,Url,Tags
0,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,JavaScript,https://github.com/freeCodeCamp/freeCodeCamp,"nonprofits,certification,curriculum,react,node..."
1,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",JavaScript,https://github.com/twbs/bootstrap,"javascript,css,html,bootstrap,jekyll-site,scss"
2,free-programming-books,Freely available programming books,,https://github.com/EbookFoundation/free-progra...,"education,list,books,resource"
3,react,"A declarative, efficient, and flexible JavaScr...",JavaScript,https://github.com/facebook/react,
4,d3,"Bring data to life with SVG, Canvas and HTML.",JavaScript,https://github.com/d3/d3,visualization


In [7]:
# Build the working frame from the selected columns.
# fillna('') and assign() each return a NEW DataFrame, so `df_important` is
# independent of `df`. The previous version filled NaNs in place and added a
# column directly on the `df[columns]` selection, which is the chained-assignment
# pattern pandas warns about (SettingWithCopyWarning).
df_important = (
    df[columns]
    .fillna('')  # missing Language / Tags / Description values become empty strings
    .assign(Timestamp="2020-08-20 22:41:12.177510")  # same fixed timestamp on every row
)

df_important.head(5)

Unnamed: 0,Repository Name,Description,Language,Url,Tags,Timestamp
0,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,JavaScript,https://github.com/freeCodeCamp/freeCodeCamp,"nonprofits,certification,curriculum,react,node...",2020-08-20 22:41:12.177510
1,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",JavaScript,https://github.com/twbs/bootstrap,"javascript,css,html,bootstrap,jekyll-site,scss",2020-08-20 22:41:12.177510
2,free-programming-books,Freely available programming books,,https://github.com/EbookFoundation/free-progra...,"education,list,books,resource",2020-08-20 22:41:12.177510
3,react,"A declarative, efficient, and flexible JavaScr...",JavaScript,https://github.com/facebook/react,,2020-08-20 22:41:12.177510
4,d3,"Bring data to life with SVG, Canvas and HTML.",JavaScript,https://github.com/d3/d3,visualization,2020-08-20 22:41:12.177510


In [8]:
# Rename the dataset columns to the names used for the database table.
# rename() without inplace returns a new frame, so the column assignments
# below never write into a selection of `df`.
df_important = df_important.rename(columns={"Repository Name": "title", "Description": "description", "Language": "language", "Tags": "tags", "Url": "git_url", "Timestamp": "timestamp"})

# Normalise the tag string: commas become spaces and hyphens are dropped
# (e.g. "css,jekyll-site" -> "css jekyllsite"). Vectorised literal replacement
# is used instead of DataFrame.applymap, which is deprecated in recent pandas.
df_important['tags'] = (
    df_important['tags']
    .str.replace(',', ' ', regex=False)
    .str.replace('-', '', regex=False)
)

# Number the rows consecutively starting at 6.
df_important['user_id'] = np.arange(6, len(df_important) + 6)
df_important.head(5)

Unnamed: 0,title,description,language,git_url,tags,timestamp,user_id
0,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,JavaScript,https://github.com/freeCodeCamp/freeCodeCamp,nonprofits certification curriculum react node...,2020-08-20 22:41:12.177510,6
1,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",JavaScript,https://github.com/twbs/bootstrap,javascript css html bootstrap jekyllsite scss,2020-08-20 22:41:12.177510,7
2,free-programming-books,Freely available programming books,,https://github.com/EbookFoundation/free-progra...,education list books resource,2020-08-20 22:41:12.177510,8
3,react,"A declarative, efficient, and flexible JavaScr...",JavaScript,https://github.com/facebook/react,,2020-08-20 22:41:12.177510,9
4,d3,"Bring data to life with SVG, Canvas and HTML.",JavaScript,https://github.com/d3/d3,visualization,2020-08-20 22:41:12.177510,10


In [9]:
# SQLAlchemy engine for the SQLite file app.db (path is relative to the working directory).
DATABASE_URL = 'sqlite:///app.db'
engine = create_engine(DATABASE_URL)

In [10]:
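# NOTE(review): disabled insert of df_important; the to_sql call in the last cell
# writes df_with_user instead. Confirm this line can be deleted.
#df_important.to_sql(con=engine, name='project', if_exists='append', index=False, )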

In [12]:
# Derive the frame that is written to the database: every row gets user_id 6.
# assign() returns a new DataFrame, so `df_important` keeps its own user_id
# values. A plain `df_with_user = df_important` would only alias the same
# object and overwrite them as well.
df_with_user = df_important.assign(user_id=6)
df_with_user.head(5)

Unnamed: 0,title,description,language,git_url,tags,timestamp,user_id
0,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,JavaScript,https://github.com/freeCodeCamp/freeCodeCamp,nonprofits certification curriculum react node...,2020-08-20 22:41:12.177510,6
1,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",JavaScript,https://github.com/twbs/bootstrap,javascript css html bootstrap jekyllsite scss,2020-08-20 22:41:12.177510,6
2,free-programming-books,Freely available programming books,,https://github.com/EbookFoundation/free-progra...,education list books resource,2020-08-20 22:41:12.177510,6
3,react,"A declarative, efficient, and flexible JavaScr...",JavaScript,https://github.com/facebook/react,,2020-08-20 22:41:12.177510,6
4,d3,"Bring data to life with SVG, Canvas and HTML.",JavaScript,https://github.com/d3/d3,visualization,2020-08-20 22:41:12.177510,6


In [13]:
# Append the rows to the 'project' table through `engine`; the DataFrame index is not written.
# Re-running this cell appends the same rows again.
df_with_user.to_sql(
    name='project',
    con=engine,
    if_exists='append',
    index=False,
)