In [None]:
"""
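Assumes the yearly CSV files (2018.csv ... 2022.csv) are located in
../../generic_repos/collect_data/, resolved relative to the current working directory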
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast

from collections import Counter

# Locations of the per-year CSV exports (plain constants, relative paths)
DATA_PATH_2018 = "../../generic_repos/collect_data/2018.csv"
DATA_PATH_2019 = "../../generic_repos/collect_data/2019.csv"
DATA_PATH_2020 = "../../generic_repos/collect_data/2020.csv"
DATA_PATH_2021 = "../../generic_repos/collect_data/2021.csv"
DATA_PATH_2022 = "../../generic_repos/collect_data/2022.csv"

# Field delimiter shared by every export
SEP = ','

# Column labels handed to read_csv via `names`; they are applied positionally
COL_NAMES = [
    "RepoID", "Name", "Type", "Topics", "Visibility", "Language",
    "Published", "Last_Modified",
    "Stars", "Forks", "WatchCount", "NetworkCount",
    "IssueCount", "PRCount", "ProjectsCount", "BranchCount",
    "DownloadCount", "ContributorCount",
    "RepoURL",
]


def _read_year(csv_path):
    """Read one yearly CSV into a DataFrame labelled with COL_NAMES.

    Since `names` is supplied and `header` is not, pandas treats the first
    line of the file as data rather than as a header row.
    """
    return pd.read_csv(csv_path, sep=SEP, names=COL_NAMES)


# Pull each CSV into its own DataFrame
data_2018 = _read_year(DATA_PATH_2018)
data_2019 = _read_year(DATA_PATH_2019)
data_2020 = _read_year(DATA_PATH_2020)
data_2021 = _read_year(DATA_PATH_2021)
data_2022 = _read_year(DATA_PATH_2022)
# SANITIZING

def _sanitize_year(frame):
    """Return a cleaned copy of one yearly DataFrame.

    Steps, in order:
      1. drop fully duplicated rows;
      2. parse 'Published' as a datetime and keep only the year;
      3. replace missing 'Published' / 'ProjectsCount' values with 0 and
         cast both columns to int;
      4. drop rows without a 'Language' value, since language is what the
         analysis cares about.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding at least the 'Published', 'ProjectsCount' and
        'Language' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; `frame` itself is not modified.
    """
    # Remove duplicates
    deduped = frame.drop_duplicates()

    # Convert 'Published' to datetime, then extract the year.
    # Unparseable/missing dates become NaN here.
    published_year = pd.to_datetime(deduped['Published']).dt.year

    # Fill NaN's. fillna()/astype() return new Series rather than modifying
    # the column, so the result has to be assigned back to take effect.
    filled = deduped.assign(
        Published=published_year.fillna(0).astype(int),
        ProjectsCount=deduped['ProjectsCount'].fillna(0).astype(int),
    )

    # Drop rows with no language val since that is what we care about
    return filled.dropna(subset=['Language'])


# Apply the same cleaning to every year
data_2018 = _sanitize_year(data_2018)
data_2019 = _sanitize_year(data_2019)
data_2020 = _sanitize_year(data_2020)
data_2021 = _sanitize_year(data_2021)
data_2022 = _sanitize_year(data_2022)

# Stack all years into a single frame (original row indices are kept)
data_all = pd.concat([data_2018, data_2019, data_2020, data_2021, data_2022])

In [None]:
def _parse_topics(raw):
    """Turn one 'Topics' cell into a list of tags.

    Strings are parsed with ast.literal_eval (which still raises on a
    malformed literal); lists/tuples are passed through as a list; anything
    else (e.g. NaN for a missing cell) is treated as "no topics".
    """
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    if isinstance(raw, (list, tuple)):
        return list(raw)
    return []


# convert column to list(list())
# use ast to parse the string literal topics into a list
topics = data_all['Topics'].apply(_parse_topics).tolist()

# split all lists and select individual items
tags = [tag for topic in topics for tag in topic]

# Tally how much each tag appears
tags_count = Counter(tags)

# create a dataframe with most common tags
top_tags_df = pd.DataFrame(tags_count.most_common(10), columns=['TagName', 'Count'])

# set figure size
fig, ax = plt.subplots(figsize=(7, 4))

# plot onto the explicit axes instead of relying on the "current" axes
# NOTE(review): newer seaborn releases may warn about `palette` without `hue` - confirm
sns.barplot(data=top_tags_df, x='TagName', y='Count', palette='gnuplot', ax=ax)

# Turn labels on x-axis (done after plotting so it applies to the final ticks)
ax.tick_params(axis='x', rotation=65)

# Get rid of grid and turn bg white
ax.grid(False)
ax.set_facecolor('white')
sns.despine(ax=ax)

# axis labels and title
ax.set_xlabel('Topics', fontsize=13, color='#3d704a')
ax.set_ylabel('Count', fontsize=13, color='#3d704a')
fig.suptitle('Most popular topics', fontsize=18, color='#3d704a');