In [2]:
import json

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.sql.types import *
import pyspark.sql.types as spark_types

import utils

# Get (or create) the Spark session used by the cells below, running on the
# "local" master under the application name "GH Users".
spark = (
    SparkSession.builder
    .master("local")
    .appName("GH Users")
    .getOrCreate()
)

In [6]:
# Read the users CSV once; `users` and `df` are two names for the same frame.
users = utils.read_csv(spark, path="/mnt/Data/GHTorrent/users.csv")
df = users

# df.count()

# Keep only the users whose `country_code` is populated, i.e. whose location
# data was mapped to a country.
df2 = df.where(df["country_code"].isNotNull())
# df2.count()

# Users who filled in the location field but could not be mapped to a
# country etc.
# df3 = df.filter(df.country_code.isNull() & df.location.isNotNull())

# df3.count()

# Of those unmapped users, the ones whose location mentions India:
# df4 = df3.filter(df.location.rlike("\\bIndia\\b"))

### Companies with most GitHub users 

In [None]:
# Values found in the `company` field that are treated as "no real company".
not_really_companies = ["NA", "N/A", "None", "none", "-", "Personal", "Student", "student", "self", "Self", "Home", "Freelance", "Freelancer"]

# People who have a company: the field is set, is not one of the placeholder
# values above, and does not contain the literal "CLICK " marker.
df_company = (
    df
    .where(
        (df.company.isNotNull())
        & ~ (df.company.isin(not_really_companies))
        & ~ (df.company.contains("CLICK "))
    )
)

# People who claim to be from India
# NOTE(review): df5 is not used by the aggregation below, which counts over all
# of df_company - confirm whether the IIT ranking was meant to be India-only.
df5 = df_company.filter(df.location.rlike("\\bIndia\\b"))

# Count & Sort
df6 = df_company.groupby("company").count().sort("count", ascending=False)

# Which IIT is at the top?
# Fixed: the method is `startswith` (the original `startswithith` does not
# exist on Column). The column is referenced by name so the predicate is
# resolved against df6 itself rather than against the unrelated `df` frame.
df6.filter(F.col("company").startswith("IIT")).show(20, False)

In [None]:
# Rank companies by the number of users that list them, largest first.
df6 = (
    df_company
    .groupBy("company")
    .count()
    .orderBy(F.desc("count"))
)

# Print the first 20 rows without truncating long company names.
df6.show(20, False)

### User counts by year and country

In [None]:
# Count country-mapped users per (year of `created_at`, `country_code`),
# ordered from the largest group to the smallest.
df4 = (
    df2
    .select(
        F.year(df2["created_at"]).alias("year"),
        "country_code",
    )
    .groupBy("year", "country_code")
    .count()
    .orderBy(F.desc("count"))
)

# df4.show()

In [None]:
# df4.count()
# Collapse to one partition so a single part file is produced, then write it
# as JSON. mode="overwrite" keeps the cell re-runnable: with the default save
# mode the write raises once the output path already exists.
df4.coalesce(1).write.json("user-year-country-count.json", mode="overwrite")

### Plot some charts

In [None]:
import pandas as pd

# Load the exported year/country counts. The context manager closes the file
# handle, which the bare open(...) call previously left open.
with open("../outputs/user-year-country-count.json") as counts_file:
    odf = pd.read_json(counts_file)

# Drop the 2017 rows.
odf = odf.loc[odf.year != 2017]
# odf['year'] = pd.to_datetime(odf['year'], format="%Y")

# odf.head(50).country_code.unique()

# Rows for 2014 only. sort_values returns a new frame, so its result has to be
# kept; previously it was discarded and the sort had no effect. Descending
# order makes head(20) below the 20 largest countries regardless of the row
# order in the file.
# NOTE(review): descending is assumed to be the intent (top countries) - confirm.
odf2 = odf.loc[odf.year == 2014].sort_values("count", ascending=False)

odf3 = odf2.head(20)

# odf3.country_code.unique()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Bar chart of user count per country for the rows in odf3.
# The chart is drawn on an explicitly created Axes: DataFrame.plot.bar opens
# its own figure when no `ax` is passed, so the previous bare plt.figure()
# call only produced an extra, empty figure.
fig, ax = plt.subplots()
odf3.plot.bar(x="country_code", y="count", ax=ax)

In [None]:
# Configure Bokeh to render its output inline in the notebook.
from bokeh.plotting import output_notebook

output_notebook()

In [None]:
from bokeh.io import show
from bokeh.palettes import Spectral11
from bokeh.plotting import figure
from bokeh.transform import factor_cmap

# Categorical factors for the x-axis and the colour mapper: the distinct
# country codes present in odf3, computed once and shared by both.
country_factors = list(odf3.country_code.unique())

# Spectral11 only has 11 colours. A categorical colour mapper assigns its
# `nan_color` to factors beyond the palette length, so repeat the palette
# until every factor has a colour of its own.
bar_palette = [
    Spectral11[idx % len(Spectral11)] for idx in range(len(country_factors))
]

p = figure(
    x_range=country_factors,
    plot_height=500
)

# One bar per country, its height taken from the `count` column.
p.vbar(
    source=odf3,

    x='country_code', top='count',
    width=1,
    line_color='white',
    fill_color=factor_cmap('country_code', palette=bar_palette, factors=country_factors)
)

show(p)

In [None]:
from bokeh.models import HoverTool
from bokeh.palettes import Spectral5

# Tooltip showing the x (year) and y (users) values under the cursor.
hover = HoverTool(
    tooltips=[
        ("year", "$x{(0000)}"),
        ("users", "$y{(0.00 a)}"),
    ]
)

p = figure(
    plot_height=500,
    title="No. of users in countries",
    x_axis_label="Year",
    y_axis_label="No. of Users",
)
p.add_tools(hover)

countries = ["us", "in", "cn", "gb", "de"]
colors = Spectral5

# Draw one line plus point markers per country, ordered by year; the colour is
# picked from the palette by position, wrapping around if it runs out.
for i, country in enumerate(countries):
    odf4 = odf[odf.country_code == country].sort_values("year")
    x, y = odf4['year'], odf4['count']
    color = colors[i % len(colors)]

    p.line(x, y, color=color, legend=country)
    p.circle(x, y, fill_color="white", size=8, color=color)

p.legend.location = "top_left"

show(p)

### Where do Indian GitHub users live?

In [7]:
# Country-mapped users whose country code is "in".
df3 = df2.where(df["country_code"] == "in")

# df3.show()

In [None]:
# NCR = ["Delhi", "New Delhi", "Gurgaon", "Noida", "Faridabad"]

# Earlier experiment, kept for reference: group by city instead of state,
# folding the NCR cities into one "Delhi (NCR)" region.
#     .where(df3.city.isNotNull())
#     .select("city", F.when(df.city.isin(NCR), "Delhi (NCR)").otherwise(df.city).name("region"))
#     .select("city")
#     .groupby("region")

# Users per state, skipping rows without a state, largest states first.
df4 = (
    df3
    .filter(df3["state"].isNotNull())
    .groupBy("state")
    .count()
    .orderBy(F.desc("count"))
)

df4.show(50)

In [10]:
# Write the current df4 as JSON from a single partition. mode="overwrite"
# keeps the cell re-runnable: with the default save mode the write raises once
# the output path already exists.
df4.coalesce(1).write.json("user-india-state-count.json", mode="overwrite")


### Number of new users per month

In [None]:
# Peek at the first ten user rows.
users.limit(10).show()

# Count users per (year, month) of `created_at`, ignoring rows where it is
# missing, ordered from the largest group to the smallest.
df3 = (
    users
    .where(users.created_at.isNotNull())
    .select(F.year(users.created_at).name("year"),
            F.month(users.created_at).name("month"))
    .groupby("year", "month")
    .count()
    .orderBy("count", ascending=False)
)

# df3.show()
# Write from a single partition. mode="overwrite" keeps the cell re-runnable:
# with the default save mode the write raises once the output path exists.
df3.coalesce(1).write.json("user-year-month-count.json", mode="overwrite")

### Number of followers of each user

In [None]:
# Read the followers CSV with an explicit schema; "\N" marks NULL values.
# NOTE(review): `spark_schema_from_json` and `db_schema` are not defined in the
# visible cells - confirm they are in scope (e.g. provided by `utils`) before
# running this on a fresh kernel.
followers = spark.read.csv(
    "/mnt/Data/GHTorrent/followers.csv",
    schema=spark_schema_from_json(db_schema["followers.csv"]),
    nullValue="\\N",
)

# Rows per follower_id, exposed as `following` and keyed by `user_id`.
# The renames happen after the aggregation, where they cannot clash.
df2 = (
    followers
    .groupBy("follower_id")
    .count()
    .withColumnRenamed("follower_id", "user_id")
    .withColumnRenamed("count", "following")
)

# print(df2.count())

# Rows per user_id, exposed as `followers`.
df3 = (
    followers
    .groupBy("user_id")
    .count()
    .withColumnRenamed("count", "followers")
)

# print(df3.count())

# df5 = df2.join(df3, "user_id", "full")

# df5.limit(100).show(100)

# print(df5.count())

### Number of repositories starred by each user

In [None]:
# Stars were previously called watchers, hence the file name.
# NOTE(review): `spark_schema_from_json` and `db_schema` are not defined in the
# visible cells - confirm they are in scope before running on a fresh kernel.
stars = spark.read.csv(
    "/mnt/Data/GHTorrent/watchers.csv",
    schema=spark_schema_from_json(db_schema["watchers.csv"]),
    nullValue="\\N",
)

# Rows per user_id, exposed as `starred`.
df4 = (
    stars
    .groupBy("user_id")
    .count()
    .withColumnRenamed("count", "starred")
)

# Full outer joins on user_id so a user present in any one of the three
# frames (df2, df3, df4) is kept.
df5 = (
    df2
    .join(df3, on="user_id", how="full")
    .join(df4, on="user_id", how="full")
)

# Write the combined frame as CSV, replacing earlier output and writing
# NULLs as "\N".
df5.write.csv(
    "user_more",
    nullValue="\\N",
    mode="overwrite",
)

### Number of repositories of each user

In [None]:
projects = utils.read_csv(spark, "/mnt/Data/GHTorrent/projects_new.csv")
# NOTE(review): an earlier cell writes its output to the relative path
# "user_more"; confirm that this CSV is that output moved into place.
user_more = utils.read_csv(spark, "/mnt/Data/GHTorrent/user_more.csv")

# Fixed: in Python `&` binds tighter than `==`, so the original
# `projects.deleted == 0 & projects.forked_from.isNull()` was parsed as
# `projects.deleted == (0 & projects.forked_from.isNull())`. The comparison is
# now evaluated on its own before being combined with the null check.
not_deleted = projects.deleted == 0

# Find source repos: not deleted and not forked from another project.
df4 = (
    projects
    .where(not_deleted & projects.forked_from.isNull())
    .groupby("owner_id")
    .count()
    .withColumnRenamed("count", "repos_source")
    .withColumnRenamed("owner_id", "user_id")
)

# Find forks: not deleted and forked from another project.
df6 = (
    projects
    .where(not_deleted & projects.forked_from.isNotNull())
    .groupby("owner_id")
    .count()
    .withColumnRenamed("count", "repos_forks")
    .withColumnRenamed("owner_id", "user_id")
)

# Join Data: full outer joins keep users that appear in only one of the frames.
df5 = user_more.join(df4, "user_id", "full").join(df6, "user_id", "full")

# Write to local directory, replacing earlier output and writing NULLs as "\N".
df5.write.csv(
    "../user_more_2",
    mode="overwrite",
    nullValue="\\N"
)