


# **Dataset**
##### Merge all the CSV files containing the data downloaded from Reddit
##### Filter the comments to obtain a dataset of limited size


In [1]:
#import all necessary packages
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive under /content/drive so the CSV files read below are reachable.
# NOTE(review): later cells use the relative path "drive/MyDrive/..." — this
# presumably relies on the working directory being /content; confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
def comments_lengths(df):
  '''
  Add a 'length' column holding the number of characters of each comment.

  input:
  dataframe of comments (must have a 'body' column)

  output:
  the same dataframe, modified in place: a new integer 'length' column is
  added; rows whose body is missing (or is not a string) get length 0
  '''
  # Iterate over the values themselves instead of looking labels 0..n-1 up,
  # so the result does not depend on the dataframe having a fresh RangeIndex.
  # The list is assigned positionally, one entry per row.
  df['length'] = [len(text) if isinstance(text, str) else 0 for text in df['body']]
  return df

In [6]:
def delete_removed(df):
  '''
  Drop the comments that carry no usable text.

  input:
  dataframe of comments (must have a 'body' column)

  output:
  new dataframe, re-indexed from 0, without the rows whose body is null
  or contains the "[removed" / "[deleted" marker
  '''
  def is_unusable(text):
    # a missing body counts as unusable
    if pd.isnull(text):
      return True
    # the markers are matched anywhere inside the text, not only as the whole body
    return any(text.find(marker) != -1 for marker in ("[removed", "[deleted"))

  unusable = df['body'].apply(is_unusable)
  return df.loc[~unusable].reset_index(drop=True)

In [None]:
def select_comments(df, th_ups, min_len):
  '''
  Keep only the most relevant comments.

  input:
  dataframe of comments (must have 'ups' and 'length' columns)
  threshold for ups (compared with the absolute value of 'ups')
  threshold for length

  output:
  new dataframe without the comments that are BOTH shorter than min_len and
  have |ups| below th_ups; the surviving rows keep their original index
  labels (the index is not reset). The number of removed comments is printed.
  '''
  # abs(): a comment counts as relevant when it is strongly up- OR down-voted
  mask = (df['ups'].abs() < th_ups) & (df['length'] < min_len)
  print(np.sum(mask))
  return df.loc[~mask]

In [29]:
# Folder holding the downloaded CSV files
in_dir="drive/MyDrive/dataset/"
# Load the four comment CSVs, one dataframe per file
df_1_comments, df_2_comments, df_3_comments, df_4_comments = [
    pd.read_csv(in_dir + file_name)
    for file_name in (
        "reddit_comments_climate.csv",
        "reddit_comments_clchange.csv",
        "reddit_comments_globalwarming.csv",
        "reddit_comments_sustainability_or_sustainable.csv",
    )
]
# Stack them into a single frame renumbered 0..n-1
df = pd.concat(
    [df_1_comments, df_2_comments, df_3_comments, df_4_comments],
    ignore_index=True,
)
# uncomment the next line to build the reduced dataset from the first file only
#df=df_1_comments
df.columns

Index(['Unnamed: 0', 'id', 'parent_id', 'created', 'subreddit', 'body',
       'author', 'author_fullname', 'ups', 'downs'],
      dtype='object')

In [30]:
# Drop comments without usable text and report the shape before and after.
print("Original shape: ", df.shape)
df=delete_removed(df)
print("Final shape: ",df.shape)

# Add the 'length' column; it is read by select_comments in the filtering step.
# (Done after the print above, so the reported shape does not include it.)
df=comments_lengths(df)

Original shape:  (80881, 10)
1827
Final shape:  (79054, 10)


In [None]:
# Keep only the most relevant comments: a comment is dropped when it is BOTH
# shorter than MIN_COMMENT_LENGTH characters and has |ups| below MIN_ABS_UPS.
MIN_ABS_UPS = 1000
MIN_COMMENT_LENGTH = 300
df = select_comments(df, th_ups=MIN_ABS_UPS, min_len=MIN_COMMENT_LENGTH)

34825


In [10]:
# Change the name of the 'body' column to 'title', the name of the text column
# in the posts frame that the comments are concatenated with later.
# rename() matches the whole column name; a substring replace on the column
# index would also rewrite any other column whose name merely contains "body".
df = df.rename(columns={"body": "title"})

In [11]:
# delete rows with the same title (text); only the first occurrence is kept
df = df.drop_duplicates(subset='title', keep='first')

In [12]:
# Remove the unneeded 'Unnamed: 0' column and mark every row as a comment
df = df.drop(columns=['Unnamed: 0']).assign(category="comment")

In [13]:
# Save all comments before deleting the other unnecessary columns.
# NOTE(review): a bare "drive/MyDrive/lab2/" string literal used to sit on the
# line above the write; it was a no-op expression, so the file has always been
# written to in_dir — confirm that this is the intended destination.
df.to_csv(in_dir+"reddit_comments_total.csv", index=False)

In [14]:
# Drop the remaining metadata columns that are not kept in the final dataset
df = df.drop(columns=['created', 'author', 'author_fullname', 'downs'])

In [21]:
# Folder holding the downloaded CSV files
in_dir = "drive/MyDrive/dataset/"
# Load the four post CSVs, one dataframe per file
df_1_posts, df_2_posts, df_3_posts, df_4_posts = [
    pd.read_csv(in_dir + file_name)
    for file_name in (
        "reddit_posts_climate.csv",
        "reddit_posts_clchange.csv",
        "reddit_posts_globalwarming.csv",
        "reddit_posts_sustainability_or_sustainable.csv",
    )
]
# Stack them into a single frame renumbered 0..n-1
df1 = pd.concat(
    [df_1_posts, df_2_posts, df_3_posts, df_4_posts],
    ignore_index=True,
)
df1.columns

Index(['Unnamed: 0', 'id', 'created', 'subreddit', 'title', 'selftext',
       'author', 'author_fullname', 'upvote_ratio', 'ups', 'num_comments'],
      dtype='object')

In [22]:
# Delete posts with the same title; only the first occurrence is kept
df1 = df1.drop_duplicates(subset='title', keep='first')

In [23]:
# Remove the unneeded 'Unnamed: 0' column, then save the complete posts table
# before trimming it further.
df1 = df1.drop(columns=['Unnamed: 0'])
df1.to_csv(in_dir+"reddit_posts_subreddit.csv", index=False)
# Drop the remaining unneeded columns and mark every row as a post
df1 = df1.drop(
    columns=['created', 'selftext', 'author', 'author_fullname', 'upvote_ratio']
).assign(category="post")
# Stack comments (df) on top of posts (df1); index labels are not renumbered here
df_final = pd.concat([df, df1])
df_final.shape

(65080, 8)

In [24]:
# Remove texts that appear in both comments and posts. keep='first' retains the
# comment row, because comments were placed before posts in df_final.
df_final = df_final.drop_duplicates(subset='title', keep='first')

In [25]:
import re
# Delete all rows whose text contains no latin letter (a-z / A-Z) at all
regex = re.compile(r'[a-zA-Z]')
# A missing or non-string title would make regex.search raise TypeError, so
# such rows are treated as "no latin letter" and dropped with the rest.
# astype(bool) keeps the mask boolean even when df_final is empty.
has_latin = df_final['title'].apply(
    lambda x: isinstance(x, str) and bool(regex.search(x))
).astype(bool)
df_final = df_final[has_latin].reset_index(drop=True)
df_final.shape

(64967, 8)

In [26]:
#save the whole dataset (comments and posts together) as one CSV, without the index column
df_final.to_csv(in_dir+"reddit_dataset_total.csv", index=False)