In [361]:
import pandas as pd

In [362]:
# Load the raw dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='television_breakingbad.csv')

In [363]:
# First, give every column a descriptive name so the DataFrame is easier to read
df.columns = [
    'ID',
    'Text',
    'PostID',
    'Subreddit',
    'Metareddit',
    'PressTime',
    'Author',
    'Upvotes',
    'Downvotes',
    'LinkKarma',
    'Karma',
    'AuthorIsGold',
]


In [364]:
# The "ID" column is redundant because "PostID" already identifies a post.
# Subreddit and Metareddit will not be used either, since every thread is
# dedicated to one topic - the Breaking Bad series.
df = df.drop(columns=["ID"])

In [365]:
# There are more posts than authors, so it makes sense to split the DataFrame into two: one for users and one for posts

In [366]:
# Per-author attributes (karma figures and gold flag), leaving out
# rows whose author shows up as '[deleted]'
df_users = df.loc[
    df['Author'] != '[deleted]',
    ['Author', 'LinkKarma', 'Karma', 'AuthorIsGold'],
]

In [367]:
# Keep exactly one row per author and assign a sequential UserID.
# Deduplicating on the whole row would keep several rows (and hence several
# UserIDs) for an author whose LinkKarma / Karma / AuthorIsGold differ between
# rows, so deduplicate on 'Author' alone. The first row seen for each author
# is the one that is kept (drop_duplicates default keep='first').
df_users = df_users.drop_duplicates(subset='Author').reset_index(drop=True)
df_users['UserID'] = range(len(df_users))

In [368]:
# Reorder the columns so that UserID comes first
df_users = df_users.reindex(
    ['UserID', 'Author', 'LinkKarma', 'Karma', 'AuthorIsGold'],
    axis='columns',
)

In [369]:
# Per-post attributes; Author is kept for now so posts can be linked to users
df_posts = df.loc[
    :,
    ['PostID', 'Text', 'PressTime', 'Upvotes', 'Downvotes', 'Author'],
]

In [370]:
# Show the posts whose Text is missing
df_posts.loc[df_posts['Text'].isnull()]

Unnamed: 0,PostID,Text,PressTime,Upvotes,Downvotes,Author
1,461m91,,1455612503,0,0,LordSadoth
3,462j49,,1455630894,0,0,NyanTortuga
6,4659c7,,1455665184,0,0,Golossos
8,45sdfi,,1455477088,302,0,Clouse
27,462nkh,,1455632848,0,0,sayitaintsoap
...,...,...,...,...,...,...
34158,461m91,,1455612503,0,0,LordSadoth
34171,44uzr8,,1454992844,190,0,balonibutt
34181,44u69n,,1454981106,38,0,PurelyCarbon
34194,452nbf,,1455110501,106,0,IvanMK


In [371]:
# Drop the rows that have no Text
df_posts = df_posts.dropna(subset=['Text'])

In [372]:
# Renumber the rows 0..n-1 and discard the old index (drop=True keeps it
# from being added back as a column)
df_posts = df_posts.reset_index(drop = True)

In [373]:
# Overwrite PostID with a sequential integer (0, 1, 2, ...) for each remaining row
df_posts = df_posts.assign(PostID=range(len(df_posts)))

In [374]:
# Adding UserID column to posts

In [375]:
# Lookup table: username -> UserID
author_to_id = {
    author: user_id
    for author, user_id in zip(df_users['Author'], df_users['UserID'])
}

In [376]:
# Look up each post's author in the table; authors missing from it get NaN
df_posts = df_posts.assign(UserID=df_posts['Author'].map(author_to_id))

In [377]:
# Drop posts that have no UserID, i.e. whose author is not in the lookup table
# (such as '[deleted]')
df_posts = df_posts.dropna(subset=['UserID'])

In [378]:
# Author is now redundant in posts: UserID identifies the user
df_posts = df_posts.drop(columns=['Author'])

In [379]:
# Cast UserID to an integer dtype (it became float when the lookup produced NaN)
df_posts = df_posts.astype({'UserID': int})

In [381]:
# Convert PressTime from Unix epoch seconds to pandas datetimes.
# The column is assigned by name rather than through `.loc[:, 'PressTime']`:
# `.loc[:, col] = values` writes into the existing integer column in place and,
# on pandas >= 2.0, does not turn it into a datetime64 column, whereas plain
# column assignment replaces the column together with its dtype.
df_posts['PressTime'] = pd.to_datetime(df_posts['PressTime'], unit='s')