In [1]:
"""
Assumes CSV files are in the same directory
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast

from collections import Counter

In [2]:
# Notebook configuration: CSV parsing options and input file locations.

# Field delimiter used by every input CSV
SEP = ','

# Column labels applied to each CSV when it is loaded (passed as `names=`)
COL_NAMES = [
    # repository identity / metadata
    "RepoID", "Name", "Type", "Topics", "Visibility", "Language",
    # timestamps
    "Published", "Last_Modified",
    # counters
    "Stars", "Forks", "WatchCount", "NetworkCount", "IssueCount",
    "PRCount", "ProjectsCount", "BranchCount", "DownloadCount",
    "ContributorCount",
    # link back to the repository
    "RepoURL",
]

# One CSV per company, resolved relative to the working directory
DATA_PATH_FACEBOOK = "facebook.csv"
DATA_PATH_AMAZON = "amazon.csv"
DATA_PATH_APPLE = "apple.csv"
DATA_PATH_MICROSOFT = "microsoft.csv"

In [3]:
# Read each company's CSV into its own DataFrame, using the shared
# delimiter and column labels from the configuration cell.
data_facebook, data_amazon, data_apple, data_microsoft = (
    pd.read_csv(csv_path, sep=SEP, names=COL_NAMES)
    for csv_path in (
        DATA_PATH_FACEBOOK,
        DATA_PATH_AMAZON,
        DATA_PATH_APPLE,
        DATA_PATH_MICROSOFT,
    )
)

In [4]:
# Sanitize the Facebook repository data.
# NOTE(review): this cell mutates data_facebook in place and replaces
# 'Published' with integer years, so run it once per load; re-running it
# would feed those integers back into pd.to_datetime.

# Remove exact duplicate rows
data_facebook.drop_duplicates(inplace=True)

# Parse 'Published' as datetimes, then keep only the year component
data_facebook['Published'] = pd.to_datetime(data_facebook['Published']).dt.year

# Fill NaN's with 0 and cast to int.
# fillna()/astype() return new Series, so the result has to be assigned
# back to the column; previously it was computed and thrown away.
data_facebook['Published'] = data_facebook['Published'].fillna(0).astype(int)
data_facebook['ProjectsCount'] = data_facebook['ProjectsCount'].fillna(0).astype(int)

# Drop rows with no language value since that is what we care about
data_facebook.dropna(subset=['Language'], inplace=True)

In [5]:
# Sanitize the Amazon repository data.
# NOTE(review): this cell mutates data_amazon in place and replaces
# 'Published' with integer years, so run it once per load; re-running it
# would feed those integers back into pd.to_datetime.

# Remove exact duplicate rows
data_amazon.drop_duplicates(inplace=True)

# Parse 'Published' as datetimes, then keep only the year component
data_amazon['Published'] = pd.to_datetime(data_amazon['Published']).dt.year

# Fill NaN's with 0 and cast to int.
# fillna()/astype() return new Series, so the result has to be assigned
# back to the column; previously it was computed and thrown away.
data_amazon['Published'] = data_amazon['Published'].fillna(0).astype(int)
data_amazon['ProjectsCount'] = data_amazon['ProjectsCount'].fillna(0).astype(int)

# Drop rows with no language value since that is what we care about
data_amazon.dropna(subset=['Language'], inplace=True)

In [8]:
# Sanitize the Apple repository data.
# NOTE(review): this cell mutates data_apple in place and replaces
# 'Published' with integer years, so run it once per load; re-running it
# would feed those integers back into pd.to_datetime.

# Remove exact duplicate rows
data_apple.drop_duplicates(inplace=True)

# Parse 'Published' as datetimes, then keep only the year component
data_apple['Published'] = pd.to_datetime(data_apple['Published']).dt.year

# Fill NaN's with 0 and cast to int.
# fillna()/astype() return new Series, so the result has to be assigned
# back to the column; previously it was computed and thrown away.
data_apple['Published'] = data_apple['Published'].fillna(0).astype(int)
data_apple['ProjectsCount'] = data_apple['ProjectsCount'].fillna(0).astype(int)

# Drop rows with no language value since that is what we care about
data_apple.dropna(subset=['Language'], inplace=True)

In [9]:
# Sanitize the Microsoft repository data.
# NOTE(review): this cell mutates data_microsoft in place and replaces
# 'Published' with integer years, so run it once per load; re-running it
# would feed those integers back into pd.to_datetime.

# Remove exact duplicate rows
data_microsoft.drop_duplicates(inplace=True)

# Parse 'Published' as datetimes, then keep only the year component
data_microsoft['Published'] = pd.to_datetime(data_microsoft['Published']).dt.year

# Fill NaN's with 0 and cast to int.
# fillna()/astype() return new Series, so the result has to be assigned
# back to the column; previously it was computed and thrown away.
data_microsoft['Published'] = data_microsoft['Published'].fillna(0).astype(int)
data_microsoft['ProjectsCount'] = data_microsoft['ProjectsCount'].fillna(0).astype(int)

# Drop rows with no language value since that is what we care about
data_microsoft.dropna(subset=['Language'], inplace=True)

In [10]:
# Stack the four per-company frames row-wise into a single DataFrame;
# each frame's existing row index is carried over unchanged.
data_all = pd.concat(
    objs=[data_facebook, data_amazon, data_apple, data_microsoft],
)

In [11]:
# Print a summary of the combined frame: index range, per-column
# non-null counts and dtypes.
data_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5989 entries, 623564659 to 1932083
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   RepoID            5989 non-null   object
 1   Name              5989 non-null   object
 2   Type              5989 non-null   object
 3   Topics            5989 non-null   object
 4   Visibility        5180 non-null   object
 5   Language          5989 non-null   object
 6   Published         5989 non-null   int64 
 7   Last_Modified     5989 non-null   object
 8   Stars             5989 non-null   int64 
 9   Forks             5989 non-null   int64 
 10  WatchCount        5989 non-null   int64 
 11  NetworkCount      5989 non-null   int64 
 12  IssueCount        5989 non-null   int64 
 13  PRCount           5989 non-null   int64 
 14  ProjectsCount     5989 non-null   int64 
 15  BranchCount       5989 non-null   int64 
 16  DownloadCount     5989 non-null   int64 
 17  Con