In [139]:
import pandas as pd
import ipywidgets as widgets
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display
from ipywidgets import interact

In [2]:
# Configuration constants: input file locations and CSV layout.

# One CSV file per year, 2018 through 2022 (paths relative to the notebook).
DATA_PATH_2018 = "2018.csv"
DATA_PATH_2019 = "2019.csv"
DATA_PATH_2020 = "2020.csv"
DATA_PATH_2021 = "2021.csv"
DATA_PATH_2022 = "2022.csv"

# Field delimiter passed to pd.read_csv.
SEP = ","

# Column labels passed to pd.read_csv(names=...); they are applied to the
# CSV fields positionally, so the order here must match the files.
COL_NAMES = [
    "RepoID", "Name", "Type", "Topics", "Visibility", "Language",
    "Published", "Last_Modified",
    "Stars", "Forks", "WatchCount", "NetworkCount",
    "IssueCount", "PRCount", "ProjectsCount", "BranchCount",
    "DownloadCount", "ContributorCount",
    "RepoURL",
]

In [3]:
# Load every yearly CSV into its own DataFrame.
# names=COL_NAMES labels the columns explicitly, so the first line of each
# file is read as data (assumes the CSVs have no header row -- TODO confirm).
data_2018, data_2019, data_2020, data_2021, data_2022 = (
    pd.read_csv(csv_path, sep=SEP, names=COL_NAMES)
    for csv_path in (
        DATA_PATH_2018,
        DATA_PATH_2019,
        DATA_PATH_2020,
        DATA_PATH_2021,
        DATA_PATH_2022,
    )
)

In [4]:
# Sanitize the 2018 data.
#
# Steps, in order:
#   1. drop exact duplicate rows;
#   2. replace 'Published' with its calendar year as an int (0 = missing date);
#   3. fill missing 'ProjectsCount' with 0 and cast to int;
#   4. drop rows without a 'Language', since language is what we analyse.
#
# The fillna/astype results are assigned back to the columns: Series.fillna
# returns a new object, so calling it without assignment changes nothing.
#
# NOTE: this cell overwrites 'Published' with year integers, so re-run the
# CSV load cell before re-running it; otherwise the integers get re-parsed
# as timestamps.
data_2018 = (
    data_2018
    .drop_duplicates()
    .assign(
        Published=lambda df: (
            pd.to_datetime(df['Published']).dt.year.fillna(0).astype(int)
        ),
        ProjectsCount=lambda df: df['ProjectsCount'].fillna(0).astype(int),
    )
    .dropna(subset=['Language'])
)

In [5]:
# Sanitize the 2019 data.
#
# Steps, in order:
#   1. drop exact duplicate rows;
#   2. replace 'Published' with its calendar year as an int (0 = missing date);
#   3. fill missing 'ProjectsCount' with 0 and cast to int;
#   4. drop rows without a 'Language', since language is what we analyse.
#
# The fillna/astype results are assigned back to the columns: Series.fillna
# returns a new object, so calling it without assignment changes nothing.
#
# NOTE: this cell overwrites 'Published' with year integers, so re-run the
# CSV load cell before re-running it; otherwise the integers get re-parsed
# as timestamps.
data_2019 = (
    data_2019
    .drop_duplicates()
    .assign(
        Published=lambda df: (
            pd.to_datetime(df['Published']).dt.year.fillna(0).astype(int)
        ),
        ProjectsCount=lambda df: df['ProjectsCount'].fillna(0).astype(int),
    )
    .dropna(subset=['Language'])
)

In [6]:
# Sanitize the 2020 data.
#
# Steps, in order:
#   1. drop exact duplicate rows;
#   2. replace 'Published' with its calendar year as an int (0 = missing date);
#   3. fill missing 'ProjectsCount' with 0 and cast to int;
#   4. drop rows without a 'Language', since language is what we analyse.
#
# The fillna/astype results are assigned back to the columns: Series.fillna
# returns a new object, so calling it without assignment changes nothing.
#
# NOTE: this cell overwrites 'Published' with year integers, so re-run the
# CSV load cell before re-running it; otherwise the integers get re-parsed
# as timestamps.
data_2020 = (
    data_2020
    .drop_duplicates()
    .assign(
        Published=lambda df: (
            pd.to_datetime(df['Published']).dt.year.fillna(0).astype(int)
        ),
        ProjectsCount=lambda df: df['ProjectsCount'].fillna(0).astype(int),
    )
    .dropna(subset=['Language'])
)

In [7]:
# Sanitize the 2021 data.
#
# Steps, in order:
#   1. drop exact duplicate rows;
#   2. replace 'Published' with its calendar year as an int (0 = missing date);
#   3. fill missing 'ProjectsCount' with 0 and cast to int;
#   4. drop rows without a 'Language', since language is what we analyse.
#
# The fillna/astype results are assigned back to the columns: Series.fillna
# returns a new object, so calling it without assignment changes nothing.
#
# NOTE: this cell overwrites 'Published' with year integers, so re-run the
# CSV load cell before re-running it; otherwise the integers get re-parsed
# as timestamps.
data_2021 = (
    data_2021
    .drop_duplicates()
    .assign(
        Published=lambda df: (
            pd.to_datetime(df['Published']).dt.year.fillna(0).astype(int)
        ),
        ProjectsCount=lambda df: df['ProjectsCount'].fillna(0).astype(int),
    )
    .dropna(subset=['Language'])
)

In [8]:
# Sanitize the 2022 data.
#
# Steps, in order:
#   1. drop exact duplicate rows;
#   2. replace 'Published' with its calendar year as an int (0 = missing date);
#   3. fill missing 'ProjectsCount' with 0 and cast to int;
#   4. drop rows without a 'Language', since language is what we analyse.
#
# The fillna/astype results are assigned back to the columns: Series.fillna
# returns a new object, so calling it without assignment changes nothing.
#
# NOTE: this cell overwrites 'Published' with year integers, so re-run the
# CSV load cell before re-running it; otherwise the integers get re-parsed
# as timestamps.
data_2022 = (
    data_2022
    .drop_duplicates()
    .assign(
        Published=lambda df: (
            pd.to_datetime(df['Published']).dt.year.fillna(0).astype(int)
        ),
        ProjectsCount=lambda df: df['ProjectsCount'].fillna(0).astype(int),
    )
    .dropna(subset=['Language'])
)

In [196]:
# Stack the five yearly frames row-wise into a single table.
# Each frame keeps its own row labels, so index values may repeat in data_all.
data_all = pd.concat(
    objs=(data_2018, data_2019, data_2020, data_2021, data_2022),
)

In [197]:

# Interactive ranking: which language has the highest total of a chosen
# metric, either for one 'Published' year or across all rows of data_all.

# Numeric columns that can be summed per language.
data_options = ["Stars",
                "Forks",
                "WatchCount",
                "NetworkCount",
                "IssueCount",
                "PRCount",
                "ProjectsCount",
                "BranchCount",
                "ContributorCount"]

# "Language" shows only the single top language; 5 and 10 show the top N.
quantity_options = ["Language", 5, 10]

# Year choices come from data_all itself. They must be computed here, at
# module level: the decorator arguments are evaluated before the function
# body ever runs, so names local to the function are not available.
# Missing years are dropped (NaN never equals anything, so it would select
# no rows) and the rest are sorted so the dropdown is in a stable order.
year_options = sorted(
    int(year) for year in data_all['Published'].dropna().unique()
) + ['All']


@interact(Top=quantity_options,
          Year=year_options,
          Most=data_options)
def filter_dataframe(Top, Year, Most):
    """Rank languages by the summed value of one metric.

    Parameters
    ----------
    Top : int or str
        Number of rows to return. Any non-integer value (the "Language"
        option) returns only the first-ranked language.
    Year : int or str
        A value of the 'Published' column to filter on, or 'All' to use
        every row of data_all.
    Most : str
        Name of the column to sum per language and rank by.

    Returns
    -------
    pandas.DataFrame
        Columns 'Language' and ``Most``, sorted by ``Most`` descending,
        with a 1-based index that serves as the rank. Empty if no rows
        match the selected year.
    """
    i_data = data_all

    if Year != 'All':
        i_data = i_data[i_data['Published'] == Year]  # keep only the chosen year

    # Total of the chosen metric per language.
    grouped = i_data.groupby('Language')[Most].sum().reset_index()

    # Highest total first; renumber from 1 so the index reads as a rank.
    sorted_grouped = (
        grouped
        .sort_values(by=Most, ascending=False)
        .reset_index(drop=True)[['Language', Most]]
    )
    sorted_grouped.index += 1

    # head() instead of positional iloc so an empty result does not raise.
    top_n = Top if isinstance(Top, int) else 1
    return sorted_grouped.head(top_n)



interactive(children=(Dropdown(description='Top', options=('Language', 5, 10), value='Language'), Dropdown(des…