In [1]:
import pandas as pd
from sqlalchemy.orm import sessionmaker
from src.db.database import connect
from src.config.states import *
from src.db.database import Repository

In [2]:
with connect() as session:
    repositories = pd.read_sql_table("repositories", session.connection())
    total = len(repositories)
    print("A total of {} repositories were retrieved ".format(total))

A total of 327016 repositories were retrieved 


In [3]:
display(repositories.state.value_counts())

repository_discarded              160149
repository_selected               127107
repository_filtered                27826
repository_finished_processing     11368
requirement_files_extracted          543
error_cloning_repository              23
Name: state, dtype: int64

In [4]:
selected = repositories[repositories.state==REP_SELECTED].copy()
selected["disk_usage"] = selected["disk_usage"].astype(int)

print(
    "Disk Usage for the {} repositories is estimated to be:\n"
    "\033[92m{} KB - {:.2f} MB - {:.2f} GB - {:.2f} TB\033[0m\n"
    .format(
        len(selected),
        selected.disk_usage.sum(),
        selected.disk_usage.sum() / 10 ** 3,
        selected.disk_usage.sum() / 10 ** 6,
        selected.disk_usage.sum() / 10 ** 9
    )
)

partitions = 3
print("Spliting in {} partitions, each will process {:.2f} GB".format(partitions, selected.disk_usage.sum()/ 10 ** 6 / 3))

Disk Usage for the 127107 repositories is estimated to be:
[92m1787572222 KB - 1787572.22 MB - 1787.57 GB - 1.79 TB[0m

Spliting in 3 partitions, each will process 595.86 GB


In [5]:
selected['cumulative_sum'] = selected['disk_usage'].cumsum()
partition_size = selected.disk_usage.sum() / 3
split_points = [partition_size, partition_size * 2]

selected_part1 = selected[selected['cumulative_sum'] <= split_points[0]].copy()
selected_part2 = selected[(selected['cumulative_sum'] > split_points[0]) & (selected['cumulative_sum'] <= split_points[1])].copy()
selected_part3 = selected[selected['cumulative_sum'] > split_points[1]].copy()
display(selected_part1, selected_part2, selected_part3)

Unnamed: 0,id,state,domain,repository,extraction_id,primary_language,disk_usage,is_mirror,git_created_at,git_pushed_at,...,has_next_page,notebooks_count,python_files_count,setups_count,requirements_count,pipfiles_count,pipfile_locks_count,created_at,updated_at,cumulative_sum
96,97,repository_selected,github.com,kevinbluer/data-science,,Python,253868,False,2013-03-24 06:12:29,2013-05-06 06:09:40,...,True,,,,,,,2023-03-27 17:04:15.884547,2023-05-22 11:40:24.809165,253868
778,779,repository_selected,github.com,andrew-reece/datascience,,Python,357656,False,2014-01-12 19:16:02,2014-12-16 21:16:17,...,True,,,,,,,2023-03-27 18:19:51.869968,2023-05-22 11:40:24.809165,611524
4338,4339,repository_selected,github.com,jschear/cs1951a-final,,Python,271380,False,2014-04-28 17:40:17,2014-05-10 05:59:07,...,True,,,,,,,2023-03-27 17:21:25.109645,2023-05-22 11:40:24.809165,882904
27709,27710,repository_selected,github.com,sarahwalters/ess,,Python,1142872,False,2015-03-28 06:11:58,2015-09-16 04:31:43,...,True,,,,,,,2023-03-27 19:42:04.296089,2023-05-22 11:40:24.809165,2025776
38205,38206,repository_selected,github.com,vvivek92/DataScience-Learning,,Jupyter Notebook,370185,False,2015-08-20 16:25:37,2016-02-26 12:13:18,...,True,,,,,,,2023-03-27 20:14:36.951807,2023-05-22 11:40:24.809165,2395961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156058,156059,repository_selected,github.com,chaitanyahardikar/Data_Science_ig_week2,,Jupyter Notebook,44,False,2020-04-27 23:03:13,2020-04-27 23:05:25,...,True,,,,,,,2023-03-28 13:09:01.008988,2023-05-22 11:40:24.809165,595442645
156064,156065,repository_selected,github.com,ldanuwinata/ETL-Data-Science,,Jupyter Notebook,5503,False,2020-04-27 23:25:00,2020-05-02 22:47:54,...,True,,,,,,,2023-03-28 13:13:27.450092,2023-05-22 11:40:24.809165,595448148
156068,156069,repository_selected,github.com,Abbe94/Data-Science-Project,,Python,533,False,2020-04-27 23:33:40,2020-04-27 23:37:15,...,True,,,,,,,2023-03-28 13:09:01.215433,2023-05-22 11:40:24.809165,595448681
156070,156071,repository_selected,github.com,learn-co-students/dsc-data-science-processes-n...,,Jupyter Notebook,674,False,2020-04-27 23:37:51,2020-08-10 18:09:31,...,True,,,,,,,2023-03-28 14:47:55.899465,2023-05-22 11:40:24.809165,595449355


Unnamed: 0,id,state,domain,repository,extraction_id,primary_language,disk_usage,is_mirror,git_created_at,git_pushed_at,...,has_next_page,notebooks_count,python_files_count,setups_count,requirements_count,pipfiles_count,pipfile_locks_count,created_at,updated_at,cumulative_sum
156074,156075,repository_selected,github.com,DSAISummerCamp/dscamp_public,,Jupyter Notebook,664967,False,2020-04-27 23:48:05,2023-03-19 16:09:34,...,False,,,,,,,2023-03-26 23:47:43.923418,2023-05-22 11:40:24.809165,596114330
156076,156077,repository_selected,github.com,tiomnenkiy/skillbox_DS,,Jupyter Notebook,197,False,2020-04-27 23:48:52,2020-04-27 23:49:22,...,True,,,,,,,2023-03-28 13:09:01.328264,2023-05-22 11:40:24.809165,596114527
156080,156081,repository_selected,github.com,sinchita-siddiquee/Vancouver-Airbnb-Price-Anal...,,Jupyter Notebook,72088,False,2020-04-28 00:52:31,2020-04-28 01:56:40,...,True,,,,,,,2023-03-28 13:09:01.602743,2023-05-22 11:40:24.809165,596186615
156083,156084,repository_selected,github.com,sunshine-coder/AppliedDataScienceCapstone,,Jupyter Notebook,5208,False,2020-04-28 01:24:53,2020-04-28 11:47:42,...,True,,,,,,,2023-03-28 13:09:13.544397,2023-05-22 11:40:24.809165,596191823
156085,156086,repository_selected,github.com,mc-hung/Dubstech-Data-Science-Workshops-2020-S...,,Jupyter Notebook,1063,False,2020-04-28 02:05:04,2020-05-13 07:10:55,...,True,,,,,,,2023-03-28 13:22:51.337171,2023-05-22 11:40:24.809165,596192886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237955,237956,repository_selected,github.com,nastya236/Sleep-monitoring-with-sensors-analysis,,Jupyter Notebook,3117,False,2021-09-27 15:54:00,2021-10-25 00:51:23,...,True,,,,,,,2023-03-28 20:05:27.276129,2023-05-22 11:40:24.809165,1191665248
237956,237957,repository_selected,github.com,meleerosa/data_science_practice,,Jupyter Notebook,202,False,2021-09-27 15:58:52,2021-11-28 11:08:05,...,True,,,,,,,2023-03-28 20:26:40.996059,2023-05-22 11:40:24.809165,1191665450
237957,237958,repository_selected,github.com,RiKjess/SoloLearnPythonDataScience,,Python,10,False,2021-09-27 15:59:36,2021-10-19 11:05:22,...,True,,,,,,,2023-03-28 20:01:50.427549,2023-05-22 11:40:24.809165,1191665460
237958,237959,repository_selected,github.com,pstumpo3/Data-Science-Project,,Jupyter Notebook,390,False,2021-09-27 16:16:25,2021-09-27 21:58:37,...,True,,,,,,,2023-03-28 19:47:12.417598,2023-05-22 11:40:24.809165,1191665850


Unnamed: 0,id,state,domain,repository,extraction_id,primary_language,disk_usage,is_mirror,git_created_at,git_pushed_at,...,has_next_page,notebooks_count,python_files_count,setups_count,requirements_count,pipfiles_count,pipfile_locks_count,created_at,updated_at,cumulative_sum
237963,237964,repository_selected,github.com,lillaszulyovszky/data-science-retreat-lectures,,Python,420786,False,2021-09-27 16:35:04,2022-03-17 13:05:13,...,True,,,,,,,2023-03-28 21:47:52.589889,2023-05-22 11:40:24.809165,1192097277
237965,237966,repository_selected,github.com,mbhsmlclub/Lecture-1-Intro-to-Data-Science-in-...,,Jupyter Notebook,579,False,2021-09-27 16:56:34,2021-10-11 03:39:01,...,True,,,,,,,2023-03-28 19:55:59.131510,2023-05-22 11:40:24.809165,1192097856
237966,237967,repository_selected,github.com,herbmks/thesis_gan_fraud_scenarios,,Python,2687,False,2021-09-27 16:56:44,2021-09-27 17:20:09,...,True,,,,,,,2023-03-28 19:47:01.252125,2023-05-22 11:40:24.809165,1192100543
237967,237968,repository_selected,github.com,GhofraneAyari/DataScienceChallenge,,Jupyter Notebook,1915,False,2021-09-27 17:09:58,2022-08-28 17:54:33,...,True,,,,,,,2023-03-28 23:49:43.410048,2023-05-22 11:40:24.809165,1192102458
237969,237970,repository_selected,github.com,lperozzi/DST1_pydeck_visual,,Jupyter Notebook,29807,False,2021-09-27 17:16:13,2021-10-04 10:06:36,...,True,,,,,,,2023-03-28 19:51:36.687529,2023-05-22 11:40:24.809165,1192132265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327001,327002,repository_selected,github.com,idealidler/Enhancing-Public-Safety-in-Philadel...,,Jupyter Notebook,3425,False,2023-03-29 02:26:15,2023-03-29 03:00:41,...,False,,,,,,,2023-03-29 01:19:46.221925,2023-05-22 11:40:24.809165,1787562904
327002,327003,repository_selected,github.com,sheetalsattiraju/DataScienceEcosystem,,Jupyter Notebook,7,False,2023-03-29 02:31:46,2023-03-29 02:45:58,...,False,,,,,,,2023-03-29 01:19:45.695252,2023-05-22 11:40:24.809165,1787562911
327003,327004,repository_selected,github.com,CJ7MO/Data-Science-Projects,,Jupyter Notebook,9244,False,2023-03-29 02:37:16,2023-03-29 02:43:49,...,False,,,,,,,2023-03-29 01:19:45.599180,2023-05-22 11:40:24.809165,1787572155
327006,327007,repository_selected,github.com,BBERNUIA/DataScienceEcosystem,,Jupyter Notebook,1,False,2023-03-29 03:05:08,2023-03-29 03:06:23,...,False,,,,,,,2023-03-29 01:19:46.507108,2023-05-22 11:40:24.809165,1787572156


In [6]:
import os
from src.db.database import Base
from src.config.consts import DB_DIR
from sqlalchemy import create_engine
engine1 = create_engine("sqlite:////{}part1_dsmining.sqlite".format(DB_DIR + os.sep), convert_unicode=True, echo=False)
engine2 = create_engine("sqlite:////{}part2_dsmining.sqlite".format(DB_DIR + os.sep), convert_unicode=True, echo=False)
engine3 = create_engine("sqlite:////{}part3_dsmining.sqlite".format(DB_DIR + os.sep), convert_unicode=True, echo=False)
Base.metadata.create_all(engine1)
Base.metadata.create_all(engine2)
Base.metadata.create_all(engine3);

  engine1 = create_engine("sqlite:////{}part1_dsmining.sqlite".format(DB_DIR + os.sep), convert_unicode=True, echo=False)
  engine2 = create_engine("sqlite:////{}part2_dsmining.sqlite".format(DB_DIR + os.sep), convert_unicode=True, echo=False)
  engine3 = create_engine("sqlite:////{}part3_dsmining.sqlite".format(DB_DIR + os.sep), convert_unicode=True, echo=False)


In [7]:
part1_data = selected_part1.to_dict(orient='records')
Session1 = sessionmaker(bind=engine1)
session1 = Session1()
session1.execute("INSERT INTO sqlite_sequence (name, seq) VALUES "
                 "('cell_data_ios', 100000000000),"
                 "('cell_markdown_features', 100000000000),"
                 "('cell_modules', 100000000000),"
                 "('cells', 100000000000),"
                 "('commits', 100000000000),"
                 "('data_ios', 100000000000),"
                 "('extractions', 100000000000),"
                 "('modules', 100000000000),"
                 "('notebook_markdowns', 100000000000),"
                 "('notebooks', 100000000000),"
                 "('python_file_data_ios', 100000000000),"
                 "('python_file_modules', 100000000000),"
                 "('python_files', 100000000000),"
                 "('requirement_files', 100000000000);")
session1.bulk_insert_mappings(Repository, part1_data)
session1.commit()
session1.close()

In [8]:
part2_data = selected_part2.to_dict(orient='records')
Session2 = sessionmaker(bind=engine2)
session2 = Session2()
session2.execute("INSERT INTO sqlite_sequence (name, seq) VALUES "
                 "('cell_data_ios', 300000000000),"
                 "('cell_markdown_features', 300000000000),"
                 "('cell_modules', 300000000000),"
                 "('cells', 300000000000),"
                 "('commits', 300000000000),"
                 "('data_ios', 300000000000),"
                 "('extractions', 300000000000),"
                 "('modules', 300000000000),"
                 "('notebook_markdowns', 300000000000),"
                 "('notebooks', 300000000000),"
                 "('python_file_data_ios', 300000000000),"
                 "('python_file_modules', 300000000000),"
                 "('python_files', 300000000000),"
                 "('requirement_files', 300000000000);")
session2.bulk_insert_mappings(Repository, part2_data)
session2.commit()

In [9]:
selected_part3.loc[selected_part3['id'] == 326419, 'git_pushed_at'] = '2023-03-27 11:06:07'
selected_part3.loc[selected_part3['id'] == 326458, 'git_pushed_at'] = '2023-03-26 23:11:42'
selected_part3.loc[selected_part3['id'] == 326481, 'git_pushed_at'] = '2023-03-27 00:43:24'
part3_data = selected_part3.to_dict(orient='records')
Session3 = sessionmaker(bind=engine3)
session3 = Session3()
session3.execute("INSERT INTO sqlite_sequence (name, seq) VALUES "
                 "('cell_data_ios', 50000000000),"
                 "('cell_markdown_features', 50000000000),"
                 "('cell_modules', 50000000000),"
                 "('cells', 50000000000),"
                 "('commits', 50000000000),"
                 "('data_ios', 50000000000),"
                 "('extractions', 50000000000),"
                 "('modules', 50000000000),"
                 "('notebook_markdowns', 50000000000),"
                 "('notebooks', 50000000000),"
                 "('python_file_data_ios', 50000000000),"
                 "('python_file_modules', 50000000000),"
                 "('python_files', 50000000000),"
                 "('requirement_files', 50000000000);")
session3.bulk_insert_mappings(Repository, part3_data)
session3.commit()