In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [2]:
# load in all the users with some sort of valid country field,
# and also the questions and answers in year 2016 to 2019.
# All three frames are read here because later cells use `questions` and
# `answers`; leaving those reads commented out makes a fresh
# "Restart & Run All" fail with a NameError.
allUsers = pd.read_csv('user data/stackoverflow_users_all_notext_clean.csv')
questions = pd.read_csv('user data/stackoverflow_questions_2016-2019_notext.csv')
answers = pd.read_csv('user data/stackoverflow_answers_2016-2019_notext.csv')

In [5]:
# Show the row(s) of the user table whose id equals 22656.
allUsers.loc[allUsers['id'] == 22656]

Unnamed: 0.1,Unnamed: 0,id,age,creation_date,reputation,up_votes,down_votes,country_iso3
792684,792684,22656,,2008-09-26 12:05:05.15 UTC,1128126,16420,6921,GBR


# Merging user data with questions

In [3]:
# take a look at the data
print(allUsers.shape[0])
# Rebind rather than mutate in place, and tolerate an already-removed
# 'Unnamed: 0' column, so that re-running this cell does not raise a KeyError.
# (rename silently skips labels that are no longer present.)
allUsers = (
    allUsers
    .rename(columns={'id': 'user_id', 'reputation': 'user_reputation'})
    .drop(columns='Unnamed: 0', errors='ignore')
)
allUsers.head(3)

2785261


Unnamed: 0,user_id,age,creation_date,user_reputation,up_votes,down_votes,country_iso3
0,7019478,,2016-10-14 13:36:30.96 UTC,1,0,0,SGP
1,7074992,,2016-10-26 12:11:08.41 UTC,1,0,0,EGY
2,7093392,,2016-10-31 02:19:04.64 UTC,1,0,0,USA


In [4]:
# Tally the rows per user_id, then collect every id that occurs more than once.
user_id_count = allUsers['user_id'].value_counts()
duplicated_id = user_id_count.loc[user_id_count.gt(1)].index.tolist()  # 344 duplicated entries

Rows that share a duplicated id are identical copies of the same user. Therefore, we can simply keep one row per distinct user.

In [12]:
# Keep one row per user_id (the first occurrence), then show the row count.
allUsers = allUsers.drop_duplicates(subset=['user_id'], keep='first')
len(allUsers)

2784917

In [6]:
# Row count of the questions table, followed by a three-row preview.
print(len(questions))
questions.iloc[:3]

7797470


Unnamed: 0,Id,answer_count,comment_count,creation_date,owner_user_id,score,view_count,favorite_count,tags
0,34553230,1,0,2016-01-01 03:06:31.22 UTC,401226.0,0,256,,pythonmacports
1,34558566,1,0,2016-01-01 17:56:58.5 UTC,2702781.0,1,256,,scalasbtsbt-plugin
2,34577016,1,0,2016-01-03 13:26:04.52 UTC,4675736.0,0,256,,phpcodeigniterlaravel-5.1


In [10]:
# Number of distinct (non-null) users who own at least one question.
questions['owner_user_id'].nunique()

2126958

In [18]:
# Number of distinct question owners that also appear in the user table.
(
    questions
    .loc[questions['owner_user_id'].isin(allUsers['user_id']), 'owner_user_id']
    .nunique()
)

781545

Only 781,545 of the users in the subset we focus on (those with a valid country field) actually asked a question on Stack Overflow in the past 4 years (2016–2019), which is around 28% of those users.

In [19]:
# Attach every question to its owner's reputation and country. The users' age
# column is left out because it holds only NA values.
quesWithLocation = (
    pd.merge(
        allUsers[['user_id', 'user_reputation', 'country_iso3']],
        questions,
        how='inner',
        left_on='user_id',
        right_on='owner_user_id',
    )
    # favorite_count is mostly NaN and tags is difficult to process, so both
    # are dropped; owner_user_id is the join key already held in user_id.
    .drop(columns=['owner_user_id', 'tags', 'favorite_count'])
    .rename(columns={
        'Id': 'question_id',
        'creation_date': 'question_creation_date',
        'score': 'question_score',
    })
)

In [21]:
# Preview the first five merged question rows.
quesWithLocation.head(n=5)

Unnamed: 0,user_id,user_reputation,country_iso3,question_id,answer_count,comment_count,question_creation_date,question_score,view_count
0,7019478,1,SGP,40044740,2,0,2016-10-14 13:44:25.71 UTC,-6,381
1,7019478,1,SGP,42085965,0,2,2017-02-07 09:21:01.72 UTC,0,40
2,7019478,1,SGP,51300090,1,3,2018-07-12 07:50:10.133 UTC,0,439
3,7093392,1,USA,40711004,1,2,2016-11-21 00:13:00.823 UTC,0,44
4,7497308,1,NOR,41967673,0,6,2017-01-31 21:18:44.88 UTC,0,207


In [23]:
# Number of question rows after the merge.
len(quesWithLocation)

3414419

In [24]:
# Discard the questions whose owner has an empty country field.
# (only 129291 rows have an empty country_iso3; 3285128 rows remain)
quesWithLocation = quesWithLocation.loc[quesWithLocation['country_iso3'].notna()]

In [25]:
# Preview the first five rows after removing empty-country rows.
quesWithLocation.iloc[:5]

Unnamed: 0,user_id,user_reputation,country_iso3,question_id,answer_count,comment_count,question_creation_date,question_score,view_count
0,7019478,1,SGP,40044740,2,0,2016-10-14 13:44:25.71 UTC,-6,381
1,7019478,1,SGP,42085965,0,2,2017-02-07 09:21:01.72 UTC,0,40
2,7019478,1,SGP,51300090,1,3,2018-07-12 07:50:10.133 UTC,0,439
3,7093392,1,USA,40711004,1,2,2016-11-21 00:13:00.823 UTC,0,44
4,7497308,1,NOR,41967673,0,6,2017-01-31 21:18:44.88 UTC,0,207


In [31]:
# Questions asked per (country, user) pair, largest counts first.
(
    quesWithLocation
    .groupby(['country_iso3', 'user_id'])[['question_id']]
    .count()
    .sort_values(by='question_id', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,question_id
country_iso3,user_id,Unnamed: 2_level_1
USA,1223975,1219
RUS,258483,703
GBR,6703783,646
USA,995862,632
CHE,1743843,631
...,...,...
IND,8790882,1
IND,8790640,1
IND,8790573,1
IND,8790558,1


In [32]:
# Write the merged question/user dataframe to a pickle file.
quesWithLocation.to_pickle(path='./question_with_location.pkl')

# Merging user data with answers

In [33]:
# Row count of the answers table, followed by a three-row preview.
print(len(answers))
answers.iloc[:3]

10238155


Unnamed: 0,Id,comment_count,creation_date,owner_user_id,score,parent_id
0,56292528,0,2019-05-24 12:16:17.087 UTC,5674777.0,2,56275396
1,56292773,0,2019-05-24 12:31:26.12 UTC,8245406.0,2,56292148
2,56292786,0,2019-05-24 12:32:31.437 UTC,8199990.0,2,49432666


In [38]:
# 65346 answers have no owner id; remove them before any user-level work.
answers = answers.loc[answers['owner_user_id'].notna()]
len(answers)  # 10172809 entries

10172809

In [39]:
# Number of rows in the user table.
len(allUsers)

2784917

In [41]:
# Number of distinct users across all the answers.
answers['owner_user_id'].nunique()

1319455

In [44]:
# The five questions (parent_id) with the most answers.
(
    answers
    .groupby('parent_id')[['Id']]
    .count()
    .sort_values(by='Id', ascending=False)
    .head(5)
)

Unnamed: 0_level_0,Id
parent_id,Unnamed: 1_level_1
20915266,61
17054000,60
114543,58
46267621,58
50718018,57


In [45]:
# Number of questions whose answer_count is below one.
len(quesWithLocation.loc[quesWithLocation['answer_count'].lt(1)])

677103

There are very popular questions that received up to 61 answers. However, a significant portion (~20%) of the questions did not receive any answers in the past 4 years.

In [40]:
# Number of distinct answer owners that also appear in the user table.
(
    answers
    .loc[answers['owner_user_id'].isin(allUsers['user_id']), 'owner_user_id']
    .nunique()
)

610070

Only 610,070 of the users in the subset we focus on (those with a valid country field) actually provided an answer on Stack Overflow in the past 4 years (2016–2019), which is around 22% of the users in focus.

In [46]:
# Preview the first three user rows.
allUsers.iloc[:3]

Unnamed: 0,user_id,age,creation_date,user_reputation,up_votes,down_votes,country_iso3
0,7019478,,2016-10-14 13:36:30.96 UTC,1,0,0,SGP
1,7074992,,2016-10-26 12:11:08.41 UTC,1,0,0,EGY
2,7093392,,2016-10-31 02:19:04.64 UTC,1,0,0,USA


In [49]:
# Preview the first three answer rows.
answers.iloc[:3]

Unnamed: 0,Id,comment_count,creation_date,owner_user_id,score,parent_id
0,56292528,0,2019-05-24 12:16:17.087 UTC,5674777.0,2,56275396
1,56292773,0,2019-05-24 12:31:26.12 UTC,8245406.0,2,56292148
2,56292786,0,2019-05-24 12:32:31.437 UTC,8199990.0,2,49432666


In [54]:
# Attach every answer to its owner's reputation and country.
ansWithLocation = (
    pd.merge(
        allUsers[['user_id', 'user_reputation', 'country_iso3']],
        answers,
        how='inner',
        left_on='user_id',
        right_on='owner_user_id',
    )
    # owner_user_id is the join key already held in user_id.
    .drop(columns='owner_user_id')
    .rename(columns={
        'parent_id': 'question_id',
        'score': 'answer_score',
        'Id': 'answer_id',
    })
)
# 7171637 rows in ansWithLocation

In [61]:
# Preview the first three merged answer rows.
ansWithLocation.iloc[:3]

Unnamed: 0,user_id,user_reputation,country_iso3,answer_id,comment_count,creation_date,answer_score,question_id
0,7573797,1,IRN,44882800,0,2017-07-03 10:03:45.947 UTC,0,9783740
1,7573797,1,IRN,45163140,0,2017-07-18 09:47:13.027 UTC,0,24985627
2,4796241,1,FIN,57655419,0,2019-08-26 09:42:51.17 UTC,0,56212126


In [64]:
# Again discard the rows whose country_iso3 is empty.
# (272328 rows have an NA country field)
ansWithLocation = ansWithLocation.loc[ansWithLocation['country_iso3'].notna()]
len(ansWithLocation)  # 6899309 rows are left after removing na country rows

6899309

In [66]:
# Save the answers-with-location dataframe to a pickle file.
# The path literal must be followed directly by the closing parenthesis; any
# extra character after the quote makes the whole cell a SyntaxError.
ansWithLocation.to_pickle('./answer_with_location.pkl')

# Separator

In [56]:
# Number of answers that have at least one comment.
len(ansWithLocation.loc[ansWithLocation['comment_count'].gt(0)])

3457140

In [67]:
# For each column of the user table, report whether it contains any missing value.
allUsers.isnull().any(axis=0)

user_id            False
age                 True
creation_date      False
user_reputation    False
up_votes           False
down_votes         False
country_iso3        True
dtype: bool

In [70]:
# Number of users whose country_iso3 is not missing.
len(allUsers.dropna(subset=['country_iso3']))

2665796

In [71]:
# Write the users that have a non-missing country_iso3 to a pickle file.
allUsers.loc[allUsers['country_iso3'].notna()].to_pickle('./user_without_null_country.pkl')