In [1]:
import json

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.sql.types import *
import pyspark.sql.types as spark_types

import utils

# Create (or reuse) the single local-mode Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("GH Users")
    .getOrCreate()
)

In [2]:
# Load the users table. `users` and `df` are two names for the same DataFrame;
# later cells refer to both.
users = utils.read_csv(spark, path="/mnt/Data/GHTorrent/users.csv")
df = users
# print(df.count())

# Users whose location was successfully mapped to a country code.
df2 = df.where(df["country_code"].isNotNull())
# print(df2.count())

# Of those, the users whose country code is "in" (India).
df4 = df2.filter(df["country_code"] == "in")
print(df4.count())

# Ideas not currently executed, kept for reference:
#
# Users who filled in the location field but could not be mapped to a country:
# df3 = df.filter(df.country_code.isNull() & df.location.isNotNull())
# df3.count()
#
# Such users whose free-text location mentions India:
# df4 = df3.filter(df.location.rlike("\\bIndia\\b"))
#
# Everyone whose free-text location mentions India:
# df5 = df_company.filter(df.location.rlike("\\bIndia\\b"))

102505


### Companies with most GitHub users 

In [16]:
# Values that appear in the `company` field but do not name an employer.
not_really_companies = [
    "NA", "N/A", "None", "none", "-", "Private", "home", "Personal",
    "Student", "student", "self", "Self", "Home", "Freelance", "Freelancer",
]

# Countries and universities that also show up in the `company` field.
not_really_companies += ["Japan", "China", "MIT", "UC Berkeley"]

# Users with a usable company value: non-null, not a known placeholder, and
# not containing the literal "CLICK ".
df_company = (
    df
    .where(
        df.company.isNotNull()
        & ~df.company.isin(not_really_companies)
        & ~df.company.contains("CLICK ")
        # Optional extra filters to drop academic institutions (disabled):
        # & ~df.company.contains("University")
        # & ~df.company.contains("IIT")
    )
)

# Spelling variants of one employer; only referenced by the disabled
# normalisation step noted below.
MSFT = ["Microsoft Corporation", "Microsoft"]

# Count users per company value and rank them.
# NOTE: the active filter selects companies starting with "IIT" or "Indian".
# The "Indian" prefix also matches "Indiana University ...", as the output shows.
df6 = (
    df_company
    .where(
        # (df_company.country_code == "in")
        df_company.company.startswith("IIT")
        | df_company.company.startswith("Indian")
    )
    # Disabled: fold the MSFT variants into a single "corporate" label.
    # .select("company", F.when(df_company.company.isin(MSFT), "Microsoft").otherwise(df_company.company).name("corporate"))
    .groupby("company")
    .count()
    .withColumnRenamed("count", "num_employees")
    # Sort on the renamed column. "count" is no longer part of the output
    # schema at this point, so referring to it only works through Spark's
    # implicit resolution against the child plan.
    .sort("num_employees", ascending=False)
)

df6.show(50, truncate=False)

+------------------------------------------------+-------------+
|company                                         |num_employees|
+------------------------------------------------+-------------+
|IIT Bombay                                      |206          |
|Indiana University                              |187          |
|IIT Kharagpur                                   |115          |
|IIT Madras                                      |90           |
|IIT Kanpur                                      |85           |
|IIT Delhi                                       |72           |
|IIT Guwahati                                    |70           |
|IIT Roorkee                                     |61           |
|Indian Institute of Science                     |56           |
|IIT                                             |43           |
|Indiana University Bloomington                  |35           |
|Indian Institute of Technology                  |31           |
|IIT Jodhpur             

### User count per year and country

In [None]:
# Number of geocoded users (df2) per (account-creation year, country code),
# largest groups first.
df4 = (
    df2
    .select(
        F.year(df2["created_at"]).alias("year"),
        "country_code",
    )
    .groupBy("year", "country_code")
    .count()
    .orderBy("count", ascending=False)
)

# df4.show()

In [None]:
# df4.count()
# Collapse to one partition so Spark emits a single part file, and overwrite
# any previous output so the cell can be re-run (the default mode raises if
# the target path already exists).
df4.coalesce(1).write.json("user-year-country-count.json", mode="overwrite")

### Pie chart of the distribution of users by country

In [1]:
import pandas as pd

# Hand the path to pandas directly so it opens *and closes* the file itself;
# wrapping it in a bare open() left the file handle unclosed.
uc = pd.read_json("../outputs/user-year-country-count.json")

# Total users per country across all years, largest first.
nuc = (
    uc.groupby("country_code", as_index=False)["count"]
    .sum()
    .sort_values("count", ascending=False)
)

# Keep the 21 largest countries as individual rows ...
top = nuc.head(21).reset_index(drop=True)
top_countries = list(top["country_code"])

# ... and fold every remaining country into a single "others" row.
# The appended list relies on the column order being (country_code, count).
others = nuc.loc[~nuc.country_code.isin(top_countries)]
top.loc[len(top)] = ["others", others["count"].sum()]

top

Unnamed: 0,country_code,count
0,us,481311
1,in,102505
2,cn,96327
3,gb,85605
4,de,70331
5,br,52196
6,ca,51814
7,fr,48813
8,ru,42426
9,au,31885


In [None]:
# Renders a donut chart from a small hard-coded Series (toy values, not the
# country counts computed earlier in the notebook).
# NOTE(review): `bokeh.charts` appears to have been split out of Bokeh into the
# separate `bkcharts` package in later releases — confirm the installed Bokeh
# version still provides this module before relying on this cell.
from bokeh.charts import Donut, show
import pandas as pd
data = pd.Series([0.15,0.4,0.7,1.0], index = list('abcd'))
pie_chart = Donut(data)
show(pie_chart)

### Plot some charts

In [None]:
import pandas as pd

# Pass the path to pandas so the file is opened and closed for us (a bare
# open() here never closed its handle).
odf = pd.read_json("../outputs/user-year-country-count.json")

# Drop 2017 rows.
# NOTE(review): presumably because 2017 is an incomplete year in the dump — confirm.
odf = odf.loc[odf.year != 2017]
# odf['year'] = pd.to_datetime(odf['year'], format="%Y")

# Countries for 2014, largest first. The original called sort_values() but
# discarded its result, so the ordering silently depended on the row order of
# the input file; assigning the sorted frame makes head(20) the 20 largest.
odf2 = odf.loc[odf.year == 2014].sort_values("count", ascending=False)

# Top 20 countries of 2014, used by the bar chart below.
odf3 = odf2.head(20)

# odf3.country_code.unique()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Draw onto an explicit Axes. A bare plt.figure() followed by DataFrame.plot
# leaves an extra, empty figure behind because pandas creates its own.
fig, ax = plt.subplots()
odf3.plot.bar(x="country_code", y="count", ax=ax)
ax.set_title("GitHub users created in 2014, by country")
ax.set_xlabel("country code")
ax.set_ylabel("users");

In [None]:
# Configure Bokeh to render its plots inline in the notebook.
from bokeh.plotting import output_notebook

output_notebook()

### Where do Indian GitHub users live?

In [None]:
# Geocoded users (df2) whose country code is "in" (India).
df3 = df2.where(df["country_code"] == "in")

# df3.show()

In [None]:
# Count Indian users per state, skipping rows with no state, largest first.
#
# Disabled alternative: group by city instead, folding the Delhi satellite
# cities into one "Delhi (NCR)" region:
# NCR = ["Delhi", "New Delhi", "Gurgaon", "Noida", "Faridabad"]
#     .where(df3.city.isNotNull())
#     .select("city", F.when(df.city.isin(NCR), "Delhi (NCR)").otherwise(df.city).name("region"))
#     .groupby("region")
df4 = (
    df3
    .filter(df3["state"].isNotNull())
    .groupBy("state")
    .count()
    .orderBy("count", ascending=False)
)

df4.show(50)

In [None]:
# Write the per-state counts as a single part file. Overwrite any previous
# output so re-running the cell does not fail on an existing path.
df4.coalesce(1).write.json("user-india-state-count.json", mode="overwrite")


### Number of new users per month

In [None]:
# Quick look at the raw rows.
users.limit(10).show()

# Number of accounts created in each (year, month), busiest months first.
# Rows without a creation timestamp are dropped before grouping.
df3 = (
    users
    .where(users.created_at.isNotNull())
    .select(
        F.year(users.created_at).name("year"),
        F.month(users.created_at).name("month"),
    )
    .groupby("year", "month")
    .count()
    .orderBy("count", ascending=False)
)

# df3.show()
# Single part file; overwrite so the cell can be re-run without the write
# failing on an already-existing output path.
df3.coalesce(1).write.json("user-year-month-count.json", mode="overwrite")

### Number of followers of each user

In [None]:
# NOTE(review): `spark_schema_from_json` and `db_schema` are not defined in any
# cell visible in this notebook — confirm where they come from (possibly the
# `utils` module or a removed cell), otherwise this cell fails on a fresh kernel.
followers = spark.read.csv(
    "/mnt/Data/GHTorrent/followers.csv",
    schema=spark_schema_from_json(db_schema["followers.csv"]),
    nullValue="\\N",
)

# Rows per follower_id, exposed as (user_id, following).
df2 = (
    followers
    .groupBy("follower_id")
    .count()
    .select(
        F.col("follower_id").alias("user_id"),
        F.col("count").alias("following"),
    )
)

# print(df2.count())

# Rows per user_id, exposed as (user_id, followers).
df3 = (
    followers
    .groupBy("user_id")
    .count()
    .select("user_id", F.col("count").alias("followers"))
)

# print(df3.count())

# The two counts are joined in the next section; a standalone check was:
# df5 = df2.join(df3, "user_id", "full")
# df5.limit(100).show(100)
# print(df5.count())

### Number of repositories starred by each user

In [None]:
# Stars were previously called watchers
stars = spark.read.csv(
    path="/mnt/Data/GHTorrent/watchers.csv",
    schema=spark_schema_from_json(db_schema["watchers.csv"]),
    nullValue="\\N",
)

df4 = (
    stars
    .groupby("user_id")
    .count()
    .withColumnRenamed("count", "starred")
)

df5 = df2.join(df3, "user_id", "full").join(df4, "user_id", "full")

df5.write.csv(
    "user_more",
    mode="overwrite",
    nullValue="\\N"
)

### Number of repositories of each user

In [None]:
projects = utils.read_csv(spark, "/mnt/Data/GHTorrent/projects_new.csv")
user_more = utils.read_csv(spark, "/mnt/Data/GHTorrent/user_more.csv")

# Python's `&` binds tighter than `==`, so the comparison must be wrapped in
# parentheses. Without them the condition is parsed as
# `deleted == (0 & forked_from.isNull())`, which is not the intended filter.
not_deleted = projects.deleted == 0

# Source repos: not deleted and not forked from another project.
df4 = (
    projects
    .where(not_deleted & projects.forked_from.isNull())
    .groupby("owner_id")
    .count()
    .withColumnRenamed("count", "repos_source")
    .withColumnRenamed("owner_id", "user_id")
)

# Forks: not deleted and forked from another project.
df6 = (
    projects
    .where(not_deleted & projects.forked_from.isNotNull())
    .groupby("owner_id")
    .count()
    .withColumnRenamed("count", "repos_forks")
    .withColumnRenamed("owner_id", "user_id")
)

# Attach both repo counts to the existing per-user table. Full outer joins
# keep users that appear on only one side.
df5 = user_more.join(df4, "user_id", "full").join(df6, "user_id", "full")

# Write to local directory
df5.write.csv(
    "../user_more_2",
    mode="overwrite",
    nullValue="\\N"
)

### Analysis on Users x More

In [3]:
# Users table enriched with the per-user activity columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
um = utils.read_csv(spark, "/home/dufferzafar/dev/github-analytics/_Help/users_with_more.csv")

# Logins of the accounts to inspect. A list of numeric ids used to be assigned
# to this same name immediately before; it was overwritten before any use
# (the filter below matches on `login`), so that dead assignment is removed.
jamians = [
    "dufferzafar",
    "kwikadi",
    "iCHAIT",
    "nickedes",
    "prerit2010",
    "TrigonaMinima",
    "rootAvish",
]

# Rows of type "USR" whose login is in the list above.
res = (
    um
    .where(
        (um.type == "USR")
        # Further filters, currently disabled:
        # & (um.deleted == 0)
        # & (um.fake == 0)
        # & (um.country_code == "in")
        & (um.login.isin(jamians))
    )
    # Disabled projection onto just the activity columns:
    # .select("login", "following", "followers", "starred", "repos_source", "repos_forks", "has_stars", "contributes_to", "issues", "pulls", "commits", "commits_others")
)

res.limit(10).show()

+-------+-------------+-------+-------------------+----+----+-------+-----------+-----------+------------+-----+---------+----------------+---------+---------+-------+------------+-----------+---------+--------------+------+-----+-------+--------------+
|     id|        login|company|         created_at|type|fake|deleted|       long|        lat|country_code|state|     city|        location|following|followers|starred|repos_source|repos_forks|has_stars|contributes_to|issues|pulls|commits|commits_others|
+-------+-------------+-------+-------------------+----+----+-------+-----------+-----------+------------+-----+---------+----------------+---------+---------+-------+------------+-----------+---------+--------------+------+-----+-------+--------------+
|4007006|      kwikadi|   null|2013-05-31 07:44:24| USR|   0|      0|79.45809174|23.40601158|          in| null|     null|           India|       21|       27|     26|          17|          9|        5|            14|     7|    1|    390|