# Forming Initial Raw Dataset

This notebook loads all of the raw data scraped for FairFrame, combines it into an initial dataset for further processing, and removes exact duplicate reviews. The final output is saved to `all_reviews.p` in the data directory.

In [1]:
import pickle
import pandas as pd
import numpy as np
import os

## Loading PeoplePerHour files


In [23]:
# Points to data directory
# Relative path; every input in this notebook is read from beneath it and the
# final pickle is written into it.
path = '../data/'

In [33]:
# Gather the contents of every pickle file in the PeoplePerHour folder into one list.
directory = os.path.join(path, 'PeoplePerHour')
all_reviews = []
# os.listdir returns entries in arbitrary order; sorting makes the
# concatenation order identical on every run.
for filename in sorted(os.listdir(directory)):
    # NOTE: pickle.load can execute arbitrary code; only load files that come
    # from a trusted source.
    # The context manager closes each handle as soon as the file has been read
    # (previously every handle was left open).
    with open(os.path.join(directory, filename), "rb") as pickle_in:
        reviews = pickle.load(pickle_in)
    # assumes each pickle holds a list of per-user records; TODO confirm
    all_reviews += reviews

In [4]:
# Number of records gathered from the PeoplePerHour pickle files
len(all_reviews)

4983

In [5]:
# Column-oriented accumulator: one independent, initially empty list per output column.
new_review_dict = {column: [] for column in ("user_no", "user_url", "reviews")}

In [6]:
# Flatten the loaded records into one entry per review, repeating the
# record's user number and URL alongside each of its reviews.
for row in all_reviews:
    for review in row["reviews"]:
        new_review_dict["user_no"].append(row["user_no"])
        new_review_dict["user_url"].append(row["user_url"])
        new_review_dict["reviews"].append(review)

In [7]:
# Turn the accumulated columns into a DataFrame, label every row with gender
# 'U' (presumably "unknown"; confirm), and keep only the two columns
# that are used further down.
PeoplePerHour_df = (
    pd.DataFrame.from_dict(new_review_dict)
    .assign(gender='U')
    .loc[:, ['gender', 'reviews']]
)

## Loading 99Designs files

In [35]:
# Gather the contents of every pickle file in the 99Designs folder into one list.
directory = os.path.join(path, '99Designs')
all_reviews = []
# os.listdir returns entries in arbitrary order; sorting makes the
# concatenation order identical on every run.
for filename in sorted(os.listdir(directory)):
    # NOTE: pickle.load can execute arbitrary code; only load files that come
    # from a trusted source.
    # The context manager closes each handle as soon as the file has been read
    # (previously every handle was left open).
    with open(os.path.join(directory, filename), "rb") as pickle_in:
        reviews = pickle.load(pickle_in)
    # assumes each pickle holds a list of per-user records; TODO confirm
    all_reviews += reviews

In [9]:
# Number of records gathered from the 99Designs pickle files
len(all_reviews)

1500

In [10]:
# Fresh column-oriented accumulator for this source: one independent, initially empty list per column.
new_review_dict = {column: [] for column in ("user_no", "user_url", "reviews")}

In [11]:
# Flatten the loaded records into one entry per review, repeating the
# record's user number and URL alongside each of its reviews.
for row in all_reviews:
    for review in row["reviews"]:
        new_review_dict["user_no"].append(row["user_no"])
        new_review_dict["user_url"].append(row["user_url"])
        # Each review entry is indexed: take its first element and drop that
        # element's first and last characters.
        # NOTE(review): presumably this strips surrounding quote characters; confirm.
        new_review_dict["reviews"].append(review[0][1:-1])

In [12]:
# Turn the accumulated columns into a DataFrame, label every row with gender
# 'U' (presumably "unknown"; confirm), and keep only the two columns
# that are used further down.
design_99_df = (
    pd.DataFrame.from_dict(new_review_dict)
    .assign(gender='U')
    .loc[:, ['gender', 'reviews']]
)

## Loading Freelancer files

In [36]:
# Read the Freelancer CSV. The file's own header line is replaced by the
# explicit column names, and column 0 is used as the index.
directory = os.path.join(path, 'Freelancer')
freelancer_csv = os.path.join(directory, 'freelancer_gendered.csv')
freelancer_df = pd.read_csv(
    freelancer_csv,
    index_col=0,
    names=["gender", "reviews", "rating"],
    header=0,
)

In [14]:
# Drop the rating column so this frame has the same two columns as the other sources.
freelancer_df = freelancer_df.loc[:, ['gender', 'reviews']]

### Combining all of the above reviews into a single list

In [15]:
# Pull the review text out of each source's frame as a plain Python list.
PeoplePerHour_list, design_99_list, freelancer_list = (
    frame["reviews"].tolist()
    for frame in (PeoplePerHour_df, design_99_df, freelancer_df)
)

In [16]:
# One flat list holding every review: PeoplePerHour first, then 99Designs,
# then Freelancer.
all_combined = [*PeoplePerHour_list, *design_99_list, *freelancer_list]

In [17]:
# Total number of reviews across the three sources, before de-duplication
len(all_combined)

118340

In [18]:
# Getting rid of all exact duplicates while keeping first-seen order.
# dict.fromkeys preserves insertion order, so the result is identical on every
# run; list(set(...)) returned the string reviews in an order that changes
# between interpreter sessions because of hash randomization, which made the
# saved output non-reproducible.
filtered = list(dict.fromkeys(all_combined))

In [19]:
# Number of distinct reviews left after removing exact duplicates
len(filtered)

102868

## Loading concatenated file

In [24]:
# Load the concatenated CSV with explicit column names.
# Because `names` is given and no `header` argument is passed, pandas treats
# the file's first line as a data row rather than as a header.
reviews = pd.read_csv(os.path.join(path, 'concatenated.csv'), names = ["gender", "reviews"])

In [25]:
# Skip the first row, then keep only the first occurrence of each review text.
# NOTE(review): the first row is presumably the CSV's header line read in as
# data; confirm.
# NOTE: this cell rebinds `reviews`, so running it again drops one more row.
reviews = reviews.iloc[1:].drop_duplicates(subset='reviews', keep='first')
reviews.shape

(20010, 2)

In [26]:
# Review text from the concatenated file as a plain Python list
concatenated_list = reviews["reviews"].tolist()

In [27]:
# Distinct reviews from the three scraped sources, before the concatenated file is merged in
len(filtered)

102868

In [28]:
# Number of de-duplicated reviews coming from the concatenated file
len(concatenated_list)

20010

In [29]:
# Merge in the reviews from the concatenated file and de-duplicate again.
# A new list is built instead of extending `filtered` in place, and
# dict.fromkeys keeps the existing order of `filtered` with previously unseen
# reviews appended in first-seen order (list(set(...)) reshuffled everything
# unpredictably between interpreter sessions).
filtered = list(dict.fromkeys([*filtered, *concatenated_list]))

In [30]:
# Final number of distinct reviews across all sources
len(filtered)

111093

In [90]:
# Persist the de-duplicated review list to all_reviews.p in the data directory.
# The context manager flushes and closes the file once the dump is done;
# the previous bare open() left the handle open, so the write was not
# guaranteed to reach disk.
with open(os.path.join(path, 'all_reviews.p'), 'wb') as pickle_out:
    pickle.dump(filtered, pickle_out)