## Stackexchange

In [None]:
!ls -lisah /data/dataset/stackexchange.com/unix.stackexchange.com/json/

## Init Spark

In [None]:
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Hive-enabled Spark session that every later cell refers to as `spark`.
spark = (
    SparkSession.builder
    .appName("unix.stackexchange.com")
    .enableHiveSupport()
    .getOrCreate()
)

## Badges

In [None]:
!head /data/dataset/stackexchange.com/unix.stackexchange.com/json/Badges.json

In [None]:
path = "file:///data/dataset/stackexchange.com/unix.stackexchange.com/json/Badges.json"

In [None]:
badges = spark.read.json(path)

In [None]:
badges.show(3, truncate=False)

In [None]:
badges.printSchema()

# Inspect all Files

In [None]:
!hdfs dfs -rm -r "/dataset/unix.stackexchange.com"

In [None]:
def get_info(name, json_dir="file:///data/dataset/stackexchange.com/unix.stackexchange.com/json"):
    """Load one JSON table, print a short preview and its schema, and return it.

    Args:
        name: Base name of the file, without the `.json` extension (e.g. "Badges").
        json_dir: Directory URI that holds the JSON files. Defaults to the
            location this notebook has always read from.

    Returns:
        The DataFrame produced by `spark.read.json` for `<json_dir>/<name>.json`.
    """
    print(f"info for {name}")
    print("------------------------------------")
    path = f"{json_dir}/{name}.json"
    df = spark.read.json(path)
    # Preview only: three untruncated rows plus the inferred schema.
    df.show(3, truncate=False)
    df.printSchema()
    return df

In [None]:
all_names = !ls /data/dataset/stackexchange.com/unix.stackexchange.com/json/

In [None]:
# Drop the ".json" extension. Only names that actually end in ".json" are touched, so
# re-running this cell is harmless and unrelated directory entries are left intact.
all_names = [name[:-5] if name.endswith(".json") else name for name in all_names]

In [None]:
all_names

# Save as Parquet

In [None]:
def save_as_parquet(name, df, num_partitions=15, mode=None):
    """Preview `df`, then write it as Parquet under /dataset/unix.stackexchange.com/.

    Args:
        name: Table name; it is lower-cased to build the `<name>.parquet` target path.
        df: DataFrame to persist.
        num_partitions: Number of partitions the data is repartitioned into
            before writing. Defaults to the previously hard-coded 15.
        mode: Save mode forwarded to the Parquet writer. `None` leaves the
            writer's default untouched; pass "overwrite" to allow re-runs
            over an existing target.
    """
    print(f"saving {name}")
    print("------------------------------------")

    # Show what is about to be written: three untruncated rows and the final schema.
    df.show(3, truncate=False)
    df.printSchema()

    target = f"/dataset/unix.stackexchange.com/{name.lower()}.parquet"
    df.repartition(num_partitions).write.parquet(target, mode=mode)

## Badges

In [None]:
df = get_info('Badges')

In [None]:
# https://spark.apache.org/docs/latest/api/python/_modules/pyspark/sql/functions.html
# https://sparkbyexamples.com/spark/spark-sql-functions/
from pyspark.sql import functions as f

df.select(f.min("Class"), f.max("Class")).collect()

In [None]:
# https://sparkbyexamples.com/pyspark/pyspark-cast-column-type/#:~:text=In%20PySpark%2C%20you%20can%20cast,Boolean%20e.t.c%20using%20PySpark%20examples.


# Another way would be via Types
# https://sparkbyexamples.com/pyspark/pyspark-sql-types-datatype-with-examples/
# https://spark.apache.org/docs/latest/sql-ref-datatypes.html
# from pyspark.sql.types import *


# Rename the Badges columns to snake_case and cast them to explicit types before saving.
save_as_parquet(
    "Badges",
    df.selectExpr(
        "cast(Id as int) id",
        "cast(UserId as int) user_id",
        "cast(Class as byte) class",
        "cast(Name as string) name",
        "cast(TagBased as boolean) tag_based",
        "cast(Date as timestamp) date",
    ),
)



## Comments

In [None]:
df = get_info('Comments')

In [None]:
df.filter("UserDisplayName is not null").show(2)

In [None]:
# Rename the Comments columns to snake_case and cast them to explicit types before saving.
# Score is stored as int (the same type Posts uses for its score): a byte tops out at 127,
# and a larger comment score would not survive the cast intact.
save_as_parquet(
    "Comments",
    df.selectExpr(
        "cast(Id as int) id",
        "cast(PostId as int) post_id",
        "cast(UserId as int) user_id",
        "cast(Score as int) score",
        "cast(ContentLicense as string) content_license",
        "cast(UserDisplayName as string) user_display_name",
        "cast(Text as String) text",
        "cast(CreationDate as timestamp) creation_date",
    ),
)

## PostHistory

In [None]:
df = get_info('PostHistory')

In [None]:
# Rename the PostHistory columns to snake_case and cast them to explicit types before saving.
save_as_parquet(
    "PostHistory",
    df.selectExpr(
        "cast(Id as int) id",
        "cast(PostId as int) post_id",
        "cast(UserId as int) user_id",
        "cast(PostHistoryTypeId as byte) post_history_type_id",
        "cast(UserDisplayName as string) user_display_name",
        "cast(ContentLicense as string) content_license",
        "cast(RevisionGUID as string) revision_guid",
        "cast(Text as String) text",
        "cast(Comment as String) comment",
        "cast(CreationDate as timestamp) creation_date",
    ),
)

## PostLinks

In [None]:
df = get_info('PostLinks')

In [None]:
# Rename the PostLinks columns to snake_case and cast them to explicit types before saving.
save_as_parquet(
    "PostLinks",
    df.selectExpr(
        "cast(Id as int) id",
        "cast(RelatedPostId as int) related_post_id",
        "cast(PostId as int) post_id",
        "cast(LinkTypeId as byte) link_type_id",
        "cast(CreationDate as timestamp) creation_date",
    ),
)

## Posts

In [None]:
df = get_info('Posts')

In [None]:
# Rename the Posts columns to snake_case and cast them to explicit types before saving.
# Order of the expressions: ids and counters, then free-text fields, then timestamps.
save_as_parquet(
    "Posts",
    df.selectExpr(
        "cast(Id as int) id",
        "cast(OwnerUserId as int) owner_user_id",
        "cast(LastEditorUserId as int) last_editor_user_id",
        "cast(PostTypeId as short) post_type_id",
        "cast(AcceptedAnswerId as int) accepted_answer_id",
        "cast(Score as int) score",
        "cast(ParentId as int) parent_id",
        "cast(ViewCount as int) view_count",
        "cast(AnswerCount as int) answer_count",
        "cast(CommentCount as int) comment_count",
        "cast(OwnerDisplayName as string) owner_display_name",
        "cast(LastEditorDisplayName as string) last_editor_display_name",
        "cast(Title as String) title",
        "cast(Tags as String) tags",
        "cast(ContentLicense as string) content_license",
        "cast(Body as string) body",
        "cast(FavoriteCount as int) favorite_count",
        "cast(CreationDate as timestamp) creation_date",
        "cast(CommunityOwnedDate as timestamp) community_owned_date",
        "cast(ClosedDate as timestamp) closed_date",
        "cast(LastEditDate as timestamp) last_edit_date",
        "cast(LastActivityDate as timestamp) last_activity_date",
    ),
)

## Tags

In [None]:
df = get_info('Tags')

In [None]:
# Rename the Tags columns to snake_case and cast them to explicit types before saving.
save_as_parquet(
    "Tags",
    df.selectExpr(
        "cast(Id as int) id",
        "cast(ExcerptPostId as int) excerpt_post_id",
        "cast(WikiPostId as int) wiki_post_id",
        "cast(TagName as string) tag_name",
        "cast(Count as int) count",
    ),
)

## Users

In [None]:
df = get_info('Users')

In [None]:
# Rename the Users columns to snake_case and cast them to explicit types before saving.
save_as_parquet(
    "Users",
    df.selectExpr(
        "cast(Id as int) id",
        "cast(AccountId as int) account_id",
        "cast(Reputation as int) reputation",
        "cast(Views as int) views",
        "cast(DownVotes as int) down_votes",
        "cast(UpVotes as int) up_votes",
        "cast(DisplayName as string) display_name",
        "cast(Location as string) location",
        "cast(ProfileImageUrl as string) profile_image_url",
        "cast(WebsiteUrl as string) website_url",
        "cast(AboutMe as string) about_me",
        "cast(CreationDate as timestamp) creation_date",
        "cast(LastAccessDate as timestamp) last_access_date",
    ),
)

## Votes

In [None]:
df = get_info('Votes')

In [None]:
# Rename the Votes columns to snake_case and cast them to explicit types before saving.
# BountyAmount is stored as short: Stack Exchange bounties range up to 500 reputation,
# which does not fit into a byte (max 127), so the previous byte cast corrupted larger bounties.
save_as_parquet(
    "Votes",
    df.selectExpr(
        "cast(Id as int) id",
        "cast(UserId as int) user_id",
        "cast(PostId as int) post_id",
        "cast(VoteTypeId as byte) vote_type_id",
        "cast(BountyAmount as short) bounty_amount",
        "cast(CreationDate as timestamp) creation_date",
    ),
)

# Analysing

In [None]:
def get_path(name):
    """Return the location of the Parquet table `name` under /dataset/unix.stackexchange.com."""
    return "/dataset/unix.stackexchange.com/{}.parquet".format(name)

## Read all Parquets

In [None]:
# Load every converted table; the names bound here are the ones the analysis cells use.
(
    badges,
    comments,
    posthistory,
    postlinks,
    posts,
    tags,
    users,
    votes,
) = (
    spark.read.parquet(get_path(table_name))
    for table_name in (
        "badges",
        "comments",
        "posthistory",
        "postlinks",
        "posts",
        "tags",
        "users",
        "votes",
    )
)

In [None]:
# Print each table's name followed by its first three rows.
for label, table in (
    ("badges", badges),
    ("comments", comments),
    ("posthistory", posthistory),
    ("postlinks", postlinks),
    ("posts", posts),
    ("tags", tags),
    ("users", users),
    ("votes", votes),
):
    print(label)
    table.show(3)

## Tags

In [None]:
tags = spark.read.parquet(get_path("tags"))

In [None]:
tags.show()

In [None]:
from pyspark.sql import functions as f

tags.filter(f.col("tag_name") == "async").show()

In [None]:
tags.filter("tag_name = 'async'").show()

In [None]:
tags.filter("tag_name like '%async%'").show()

In [None]:
tags.filter(f.col("tag_name").like('%async%')).show()

In [None]:
tags.select("tag_name", "count").orderBy(f.col("count").desc()).show(20)

### Wordcloud

Requires the `wordcloud` Python package (`matplotlib` is installed with it as a dependency):

```
pip install wordcloud
```

See the [documentation](https://github.com/amueller/word_cloud).

In [None]:
filtered_tags = tags.select("tag_name", "count").orderBy(f.col("count").desc()).filter("count > 100")

In [None]:
filtered_tags.show(2)
filtered_tags.count()

In [None]:
# Collect the filtered tags into a plain {tag_name: count} dict.
frequencies = filtered_tags.toPandas().set_index('tag_name')['count'].to_dict()

In [None]:
frequencies['linux']

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build a 2000x1000 cloud in which each tag is weighted by its count.
wordcloud = WordCloud(width=2000, height=1000)
wordcloud.generate_from_frequencies(frequencies)

# Draw the cloud on a bare canvas (axes hidden) and store the figure next to the notebook.
fig, ax = plt.subplots(figsize=(20, 30))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')

fig.savefig("./wordcloud.png")


In [None]:
users.printSchema()

In [None]:
# Sanity check on the user ids: all rows, rows that have an id, and distinct ids.
# The last count selects `id` before de-duplicating; a bare distinct() compares whole
# rows and therefore could not reveal duplicated ids.
print(users.count())
print(users.filter("id is not null").count())
print(users.filter("id is not null").select("id").distinct().count())

In [None]:
# Peek at the columns used by the rankings below.
(
    users
    .select("account_id", "display_name", "views", "down_votes", "up_votes", "reputation")
    .show(2)
)


In [None]:
# most reputation
# https://stackexchange.com/users/{account_id}/
# Top 10 users ordered by reputation, highest first.
(
    users
    .select("account_id", "display_name", "views", "down_votes", "up_votes", "reputation")
    .sort(f.col("reputation").desc())
    .show(10, False)
)


In [None]:
# most viewed
# Top 10 users ordered by profile views, highest first.
(
    users
    .select("account_id", "display_name", "views", "down_votes", "up_votes", "reputation")
    .sort(f.col("views").desc())
    .show(10, False)
)

In [None]:
# downvoters
# Top 10 users ordered by the number of down votes they cast, highest first.
(
    users
    .select("account_id", "display_name", "views", "down_votes", "up_votes", "reputation")
    .sort(f.col("down_votes").desc())
    .show(10, False)
)

## Analysing a Question

- [83577](https://unix.stackexchange.com/questions/83577/how-to-invoke-vim-with-line-numbers-shown)

In [None]:
posts.filter("id = 83577").toPandas().T

In [None]:
posts.filter("id = 648583").toPandas().T

In [None]:
posts.filter("id = 648608").toPandas().T

In [None]:
posts.select("parent_id").groupBy("parent_id").count().sort(f.desc("count")).show(20)

## Counts

- Inspired by [davidvrba](https://github.com/davidvrba/Stackoverflow-Data-Analysis)

In [None]:
posts.count()

In [None]:
# 1 = Question
# 2 = Answer
# 3 = Orphaned tag wiki
# 4 = Tag wiki excerpt
# 5 = Tag wiki
# 6 = Moderator nomination
# 7 = "Wiki placeholder" (seems to only be the election description)
# 8 = Privilege wiki

# Split the posts by type, using the ids from the legend above (1 = question, 2 = answer).
questions = posts.where(f.col('post_type_id') == 1)
answers = posts.where(f.col('post_type_id') == 2)

In [None]:
print(questions.count())
print(answers.count())

In [None]:
# questions with accepted answer

questions.filter(f.col('accepted_answer_id').isNotNull()).count()

In [None]:
# count users

print(posts.filter(f.col('owner_user_id').isNotNull()).select('owner_user_id').distinct().count())
print(users.filter("id is not null").select("id").distinct().count())

## Response Time

In [None]:
# Pair each question with its accepted answer and compute the delay between the two
# creation timestamps in seconds. Only positive delays are kept, shortest first.
response_time = (
    questions.alias('questions')
    .join(
        answers.alias('answers'),
        on=f.col('questions.accepted_answer_id') == f.col('answers.id'),
    )
    .select(
        f.col('questions.id'),
        f.col('questions.creation_date').alias('question_time'),
        f.col('answers.creation_date').alias('answer_time'),
        (
            f.unix_timestamp(f.col('answers.creation_date'))
            - f.unix_timestamp(f.col('questions.creation_date'))
        ).alias('response_time'),
    )
    .where('response_time > 0')
    .sort('response_time')
)

In [None]:
response_time.show(2, False)

In [None]:
# Same pairing as before, but self-answered questions are excluded. The `!=` comparison
# is NULL when either owner id is missing, so those pairs are dropped as well.
response_time = (
    questions.alias('questions')
    .join(
        answers.alias('answers'),
        on=f.col('questions.accepted_answer_id') == f.col('answers.id'),
    )
    .where(f.col("questions.owner_user_id") != f.col("answers.owner_user_id"))
    .select(
        f.col('questions.id'),
        f.col('questions.creation_date').alias('question_time'),
        f.col('answers.creation_date').alias('answer_time'),
        (
            f.unix_timestamp(f.col('answers.creation_date'))
            - f.unix_timestamp(f.col('questions.creation_date'))
        ).alias('response_time'),
    )
    .where('response_time > 0')
    .sort('response_time')
)
    


In [None]:
response_time.show(5, False)

## Hourly Data

In [None]:
# Preview the derived hour-of-day column. The result is deliberately not assigned:
# DataFrame.show() only prints and returns None, so `hourly_data` would have been None.
(
    response_time
    .withColumn('hours', f.hour("answer_time"))
    .show(2)
)

In [None]:
# Number of accepted answers per hour of day of the answer timestamp, as a pandas frame.
hourly_data = (
    response_time
    .select(f.hour("answer_time").alias('hours'))
    .groupBy('hours')
    .count()
    .sort('hours')
    .limit(24)
    .toPandas()
)

In [None]:
hourly_data.plot(
    x='hours', y='count', figsize=(12, 6), 
    title='Answer Hour',
    legend=False,
    kind='bar',
    xlabel='Hour',
    ylabel='Number of answered questions'
)

In [None]:
# Number of accepted answers per calendar year of the answer timestamp, as a pandas frame.
year_data = (
    response_time
    .select(f.year("answer_time").alias('years'))
    .groupBy('years')
    .count()
    .sort('years')
    .toPandas()
)

In [None]:
year_data.plot(
    x='years', y='count', figsize=(12, 6), 
    title='Answer Year',
    legend=False,
    kind='bar',
    xlabel='Year',
    ylabel='Number of answered questions'
)

In [None]:
# Bucket the response time into whole hours (rounded up) and keep the 48 smallest buckets.
response_hours = (
    response_time
    .select(f.ceil(f.col('response_time') / 3600).alias('hours'))
    .groupBy('hours')
    .count()
    .sort('hours')
    .limit(48)
    .toPandas()
)

In [None]:
response_hours.plot(
    x='hours', y='count', figsize=(12, 6), 
    title='Response time of questions',
    legend=False,
    kind='bar',
    xlabel='Hour',
    ylabel='Number of answered questions'
)

## See the time evolution of the number of questions and answers

In [None]:
# Weekly totals of questions and answers, counting only posts that still have an owner.
# Each window is labelled with its start date.
posts_grouped = (
    posts
    .where('owner_user_id is not null')
    .groupBy(f.window('creation_date', '1 week'))
    .agg(
        f.sum(f.when(f.col('post_type_id') == 1, 1).otherwise(0)).alias('questions'),
        f.sum(f.when(f.col('post_type_id') == 2, 1).otherwise(0)).alias('answers'),
    )
    .withColumn('date', f.col('window.start').cast('date'))
    .sort('date')
    .toPandas()
)

In [None]:
posts_grouped

In [None]:
# Both series (questions and answers) share the y-axis, so it is labelled generically.
posts_grouped.plot(
    x='date',
    figsize=(12, 6),
    title='Number of questions/answers per week',
    legend=True,
    xlabel='Date',
    ylabel='Number of posts',
    kind='line'
)

In [None]:
# Same aggregation as the weekly one, over 4-week windows. Despite the variable name
# these are not calendar months. Each window is labelled with its start date.
posts_grouped_month = (
    posts
    .where('owner_user_id is not null')
    .groupBy(f.window('creation_date', '4 weeks'))
    .agg(
        f.sum(f.when(f.col('post_type_id') == 1, 1).otherwise(0)).alias('questions'),
        f.sum(f.when(f.col('post_type_id') == 2, 1).otherwise(0)).alias('answers'),
    )
    .withColumn('date', f.col('window.start').cast('date'))
    .sort('date')
    .toPandas()
)

In [None]:
# The data is aggregated over 4-week windows, so the title says so (it used to read
# "per week"). Both series share the y-axis, hence the generic label.
posts_grouped_month.plot(
    x='date',
    figsize=(12, 6),
    title='Number of questions/answers per 4 weeks',
    legend=True,
    xlabel='Date',
    ylabel='Number of posts',
    kind='line'
)

# Tags

In [None]:
# Questions per 4-week window whose tag string contains "nano" / "vim".
# NOTE(review): contains() is a substring match on the raw tag string, so tags that merely
# include these words are presumably counted too. Confirm against the tag format.
# NOTE(review): the variable is called vi_sudo_tag but holds nano/vim counts.
vi_sudo_tag = (
    questions
    .select('id', 'creation_date', 'tags')
    .groupBy(f.window('creation_date', "4 weeks"))
    .agg(
        f.sum(f.when(f.col('tags').contains("nano"), 1).otherwise(0)).alias('nano'),
        f.sum(f.when(f.col('tags').contains("vim"), 1).otherwise(0)).alias('vim'),
    )
    .withColumn('date', f.col('window.start').cast('date'))
    .sort('date')
    .toPandas()
)

In [None]:
vi_sudo_tag

In [None]:
# Give the figure a title so it is understandable when the notebook is only skimmed.
vi_sudo_tag.plot(
    x='date',
    figsize=(12, 6),
    title='Questions with "nano" / "vim" in their tags per 4 weeks',
    legend=True,
    xlabel='Date',
    ylabel='Number of questions',
    kind='line'
)

# Stopping Spark

In [None]:
spark.stop()