In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.sql.types import *
import pyspark.sql.types as spark_types

import utils

spark = SparkSession.builder.master("spark://vm1:7077").appName("Cluster Code - Zafar").getOrCreate()

In [None]:
# Testing to see if everything is okay
commits = utils.read_csv(spark, "hdfs:/commits_new.csv")
commits.limit(10).show()

### Number of source & fork repositories of users (shouldn't be run again)

In [5]:
projects =  utils.read_csv(spark, "hdfs:/projects.csv", "projects_new.csv")

# This file has now been replaced with a new version
user_more =  utils.read_csv(spark, "hdfs:/user_more.csv")

# Find source repos
df4 = (
    projects
    .where((projects.deleted == 0) & (projects.forked_from.isNull()))
    .groupby("owner_id")
    .count()
    .withColumnRenamed("count", "repos_source")
    .withColumnRenamed("owner_id", "user_id")
)

# Find forks
df6 = (
    projects
    .where((projects.deleted == 0) & (projects.forked_from.isNotNull()))
    .groupby("owner_id")
    .count()
    .withColumnRenamed("count", "repos_forks")
    .withColumnRenamed("owner_id", "user_id")
)

# Join Data
df5 = user_more.join(df4, "user_id", "full").join(df6, "user_id", "full")

# Write to local directorya
df5.write.csv(
    "/user_more_2",
    mode="overwrite",
    nullValue="\\N"
)

### Issue Punchcard

In [5]:
issues = utils.read_csv(spark, "hdfs:/issues.csv")

df5 = (
    issues
    .where(
        issues.created_at.isNotNull()
    )
    .select(
        F.year('created_at').alias('year'), 
        F.month('created_at').alias('month'), 
        F.dayofmonth('created_at').alias('day'), 
        F.hour('created_at').alias('hour')
    )
   .groupBy('year', 'month', 'day', 'hour')
   .count()
)

df5.limit(10).show()

# df5.explain()
# df5.coalesce(1).write.json("hdfs:/issue_punchcard")

+----+-----+----+----+--------+
|year|month| day|hour|   count|
+----+-----+----+----+--------+
|null| null|null|null|54086297|
+----+-----+----+----+--------+



### Number of Commits of each user

In [15]:
commits = utils.read_csv(spark, "hdfs:/commits_new.csv")

res = (
    commits
    .groupby("author_id")
    .count()
    .withColumnRenamed("count", "commits_authored")
    .withColumnRenamed("author_id", "user_id")
)

res.write.csv(
    "hdfs:/user_commit_count",
    mode="overwrite",
    nullValue="\\N"
)

### Number of Commits of every project

In [16]:
# commits = utils.read_csv(spark, "hdfs:/commits_new.csv")

res = (
    commits
    .groupby("project_id")
    .count()
    .withColumnRenamed("count", "total_commits")
)

res.write.csv(
    "hdfs:/project_commit_count",
#     mode="overwrite",
    nullValue="\\N"
)

### Number of Commits on a project not authored by Owner

In [17]:
commits = utils.read_csv(spark, "hdfs:/commits_new.csv")
projects = utils.read_csv(spark, "hdfs:/projects_new.csv")

commits.createOrReplaceTempView("commits")
projects.createOrReplaceTempView("projects")

q = """
    SELECT C.project_id as project_id, COUNT(*) as total_commits_by_others
    FROM commits as C, projects as P
    WHERE C.project_id = P.id
    AND C.author_id <> P.owner_id
    GROUP BY C.project_id
"""

res = spark.sql(q)

res.write.csv(
    "hdfs:/project_commit_others_count",
#     mode="overwrite",
    nullValue="\\N"
)

### Number of commits of a user made on repositories not owned by them

In [18]:
q = """
    SELECT C.author_id as user_id, 
           COUNT(*) as total_commits_on_other_repos

    FROM commits as C, projects as P

    WHERE C.project_id = P.id
    AND C.author_id <> P.owner_id

    GROUP BY C.author_id
"""

res = spark.sql(q)

res.write.csv(
    "hdfs:/user_commit_others_count",
#     mode="overwrite",
    nullValue="\\N"
)

### "Top" Users

In [24]:
users_more = utils.read_csv(spark, "hdfs:/users_more.csv")
users = utils.read_csv(spark, "hdfs:/users.csv")

users = (
    users
    .where(
          (users.type == "USR")
        & (users.deleted == 0)
        & (users.fake == 0)
    )
    .withColumnRenamed("id", "user_id")
    .select("user_id", "login", "company")
)

user_new = users.join(users_more, "user_id", "left")

top = (
    user_new
#     .orderBy("followers", ascending=False)
    .orderBy("has_stars", ascending=False)
)

top.show(50)

+-------+-------------+--------------------+---------+---------+-------+------------+-----------+---------+---------------+------+-----+-------+--------------+
|user_id|        login|             company|following|followers|starred|repos_source|repos_forks|has_stars|contributess_to|issues|pulls|commits|commits_others|
+-------+-------------+--------------------+---------+---------+-------+------------+-----------+---------+---------------+------+-----+-------+--------------+
|   3871| sindresorhus|@avajs @chalk @ye...|      188|    15755|   2274|         974|        142|   226099|            565|  3080|  108|  26326|          8341|
|6498757| FreeCodeCamp|                null|        1|      110|      6|          95|          5|   214583|           null|     1| null|     18|          null|
| 376498|           Tj|                Apex|      175|    25827|    966|         238|         81|   114326|            138|   873| null|   4922|          3539|
| 234594|       docker|                n

### "Top" Users from India

In [34]:
users_more = utils.read_csv(spark, "hdfs:/users_more.csv")
users = utils.read_csv(spark, "hdfs:/users.csv")

users = (
    users
    .where(
          (users.type == "USR")
        & (users.deleted == 0)
        & (users.fake == 0)
        & (users.country_code == "in")
    )
    .withColumnRenamed("id", "user_id")
    .select("user_id", "login", "company", "state", "city")
)

users_more = (
    users_more
    .select("user_id", "followers", "has_stars", "contributes_to", "issues", "pulls", "commits", "commits_others")
)

user_new = (
    users
    .join(users_more, "user_id", "left")
)

top = (
    user_new
#     .orderBy("followers", ascending=False)
    .orderBy("has_stars", ascending=False)
#     .orderBy("contributes_to", ascending=False)
)

top.show(50)

+--------+-----------------+--------------------+--------------+---------+---------+---------+--------------+------+-----+-------+--------------+
| user_id|            login|             company|         state|     city|followers|has_stars|contributes_to|issues|pulls|commits|commits_others|
+--------+-----------------+--------------------+--------------+---------+---------+---------+--------------+------+-----+-------+--------------+
|   23402|          hemanth|             @paypal|          null|     null|     1209|    14032|           148|   505|   39|   5212|          1420|
| 6310255|amitshekhariitbhu|          Bobble App|         Delhi|New Delhi|      311|    10360|             7|    10| null|   1581|           509|
| 3408218|    sachinchoolur|Available for New...|     Karnataka|Bengaluru|      255|     9799|             3|     1|    1|    901|            53|
|  163695|        chinchang|           @wingify |         Delhi|    Delhi|      468|     9326|             9|   136|    2|  

In [35]:
users_more = utils.read_csv(spark, "hdfs:/users_more.csv")
users = utils.read_csv(spark, "hdfs:/users.csv")

users = (
    users
    .withColumnRenamed("id", "user_id")
#     .select("user_id", "login", "company", "state", "city")
)

users_more = (
    users_more
#     .select("user_id", "followers", "has_stars", "contributes_to", "issues", "pulls", "commits", "commits_others")
)

res = (
    users
    .join(users_more, "user_id", "left")
)

res.write.csv(
    "hdfs:/users_with_more",
    nullValue="\\N"
)

### Commits Punchcard for top users

In [36]:
commits = utils.read_csv(spark, "hdfs:/commits_new.csv")
projects = utils.read_csv(spark, "hdfs:/projects_new.csv")

# top_users = [5203, 896, 376498, 6240, 1779, 9236, 1570, 3871, 1736, 13009, 24452, 616741, 2468643, 2427, 81423, 796, 10005, 417948, 2016667, 1954]
jamians = [1432224, 5107602, 4007006, 6145009, 2859386, 4925305, 2549876]

res = (
    commits
    .where(
        (commits.created_at.isNotNull())
#         & (commits.author_id.isin(top_users))
        & (commits.author_id.isin(jamians))
    )
    .select(
        "author_id",
        F.date_format('created_at', 'E').alias('day'),
        F.hour('created_at').alias('hour')
    )
   .groupBy(
    'author_id',
    'day',
    'hour'
   )
   .count()
   .withColumnRenamed("count", "commits")
#    .withColumnRenamed("author_id", "user_id")
)

res.write.json(
    "hdfs:/jamians_commit_punchcard"
)

### Commits Punchcard for EVERYONE!

In [8]:
res = (
    commits
    .where(
        (commits.created_at.isNotNull())
    )
    .select(
        F.date_format('created_at', 'E').alias('day'),
        F.hour('created_at').alias('hour')
    )
   .groupBy(
    'day',
    'hour'
   )
   .count()
   .withColumnRenamed("count", "commits")
)

res.write.json(
    "hdfs:/commit_punchcard"
)

### Projects (More Data)

In [14]:
projects =  utils.read_csv(spark, "hdfs:/projects_new.csv")
stars = utils.read_csv(spark, "hdfs:/watchers.csv")
issues = utils.read_csv(spark, "hdfs:/issues.csv")

pcommits = utils.read_csv(spark, "hdfs:/project_commit_count.csv")
pcommits_others = utils.read_csv(spark, "hdfs:/project_commit_others_count.csv")

members = utils.read_csv(spark, "hdfs:/project_members_new.csv")

# Find stars on a repo
pstars = (
    stars
    .groupby("repo_id")
    .count()
    .withColumnRenamed("count", "stars")
    .withColumnRenamed("repo_id", "project_id")
)

# Find forks of a repo
pforks = (
    projects
    .where(projects.forked_from.isNotNull())
    .groupby("forked_from")
    .count()
    .withColumnRenamed("count", "forks")
    .withColumnRenamed("forked_from", "project_id")
)

# Find total issues on a repo
pissues = (
    issues
    .where(issues.pull_request_id.isNull())
    .groupby("repo_id")
    .count()
    .withColumnRenamed("count", "issues")
    .withColumnRenamed("repo_id", "project_id")
)

# Find pull requests on a repo
ppullreqs = (
    issues
    .where(issues.pull_request_id.isNotNull())
    .groupby("repo_id")
    .count()
    .withColumnRenamed("count", "pull_requests")
    .withColumnRenamed("repo_id", "project_id")
)

# Find members of a repo
pmembers = (
    members
    .groupby("project_id")
    .count()
    .withColumnRenamed("count", "contributors")
)

# Join Data
pmore = (
    pstars
    .join(pforks, "project_id", "full")
    .join(pmembers, "project_id", "full")
    .join(pissues, "project_id", "full")
    .join(ppullreqs, "project_id", "full")
    .join(pcommits, "project_id", "full")
    .join(pcommits_others, "project_id", "full")
)

# pmore.show()

pmore.write.csv(
    "hdfs:/projects_more",
    mode="overwrite",
    nullValue="\\N"
)

### Users (More Data)

In [18]:
issues = utils.read_csv(spark, "hdfs:/issues.csv")
members = utils.read_csv(spark, "hdfs:/project_members_new.csv")

user_more = utils.read_csv(spark, "hdfs:/user_more.csv")

ucommits = utils.read_csv(spark, "hdfs:/user_commit_count.csv")
ucommits_others = utils.read_csv(spark, "hdfs:/user_commit_others_count.csv")

projects = utils.read_csv(spark, "hdfs:/projects_new.csv")
projects_more = utils.read_csv(spark, "hdfs:/projects_more")

projects.createOrReplaceTempView("projects")
projects_more.createOrReplaceTempView("projects_more")

# Find total stars that a users repos have recieved
q = """
    SELECT P.owner_id as user_id, sum(M.stars) as stars_on_repos
    FROM projects as P, projects_more as M
    WHERE P.id = M.project_id
    GROUP BY P.owner_id
"""

ustars = spark.sql(q)

# Find total repos a user is members of
umembers = (
    members
    .groupby("user_id")
    .count()
    .withColumnRenamed("count", "members_of_repos")
)


# Find total issues by user
uissues = (
    issues
    .where(issues.pull_request_id.isNull())
    .groupby("reporter_id")
    .count()
    .withColumnRenamed("count", "issues")
    .withColumnRenamed("reporter_id", "user_id")
)

# Find pull requests by user
upullreqs = (
    issues
    .where(issues.pull_request_id.isNotNull())
    .groupby("reporter_id")
    .count()
    .withColumnRenamed("count", "pull_requests")
    .withColumnRenamed("reporter_id", "user_id")
)

# Join Data
umore = (
    user_more
    .join(ustars, "user_id", "full")
    .join(umembers, "user_id", "full")
    .join(uissues, "user_id", "full")
    .join(upullreqs, "user_id", "full")
    .join(ucommits, "user_id", "full")
    .join(ucommits_others, "user_id", "full")
)

umore.write.csv(
    "hdfs:/users_most",
    mode="overwrite",
    nullValue="\\N"
)

DataFrame[project_id: string, stars: bigint, forks: bigint, contributors: bigint, issues: bigint, pull_requests: bigint, commits: bigint, commits_by_others: bigint]