In [1]:
import pandas as pd
import numpy as np
import statistics as stats

In [2]:
# Read the raw listings CSV (path is relative to the notebook's working directory)
listingsPath = "../data/listings.csv"
data = pd.read_csv(listingsPath)

In [3]:
# Columns of interest that are carried forward into the cleaned dataset
focusVariables = [
    "id",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "price",
    "number_of_reviews",
    "review_scores_rating",
]

# Take an independent copy restricted to those columns so that later edits
# to cleanData never touch the raw frame
cleanData = data.loc[:, focusVariables].copy()

In [4]:
# Remove '$', ',' and ' ' from every string cell (replace them with the empty string),
# e.g. "$1,200.00" -> "1200.00". Non-string cells are left untouched.
# A single raw-string character class is used instead of ["\$", ",", " "]: "\$" inside a
# normal string literal is an invalid escape sequence (SyntaxWarning on Python 3.12+),
# and inside [...] the '$' needs no escaping.
# NOTE(review): values cleaned here remain text; presumably 'price' should be converted
# to a numeric dtype before modelling -- confirm downstream.
cleanData = cleanData.replace(r"[$, ]", "", regex=True)

## Dummifying Amenities Variables ##

In [5]:
# Create dummy (indicator) variables, one column per amenity.
# Strip braces, double quotes and spaces from the raw 'amenities' text, then split on
# commas so each listing holds a list of amenity names
# (presumably the raw field looks like {TV,"Cable TV",Wifi} -- confirm against the CSV).
amenities = data['amenities']
amenities = amenities.replace(r'[{}" ]', "", regex=True)
amenities = amenities.str.split(",")

# explode() yields one row per (listing, amenity) while keeping the listing's index label;
# dropna() omits listings whose amenities value is missing, as the previous
# apply(pd.Series).stack() did (and it was far slower on large frames).
# groupby(level=0).sum() collapses back to one row per listing; it replaces
# DataFrame.sum(level=0), which was deprecated in pandas 1.3 and removed in pandas 2.0.
dummies = pd.get_dummies(amenities.explode().dropna()).groupby(level=0).sum()

In [6]:
# Pairwise correlation between all amenity dummy columns (DataFrame.corr defaults)
matrix = dummies.corr()

# Flatten the square matrix to long form: one row per (Variable1, Variable2) pair
corrValues = matrix.stack().reset_index()
corrValues.columns = ['Variable1', 'Variable2', 'Correlation']

# Discard the diagonal (a variable paired with itself), then order the remaining
# pairs from the highest correlation to the lowest
isOffDiagonal = corrValues['Variable1'] != corrValues['Variable2']
corrValues = corrValues[isOffDiagonal].sort_values('Correlation', ascending=False)

In [7]:
# Pairs with a correlation above 0.5 are flagged as potentially collinear
isStronglyCorrelated = corrValues['Correlation'] > 0.5
potentialColl = corrValues[isStronglyCorrelated]
potentialColl

Unnamed: 0,Variable1,Variable2,Correlation
1606,Washer,Dryer,0.95743
458,Dryer,Washer,0.95743
775,HairDryer,Hangers,0.821195
816,Hangers,HairDryer,0.821195
1026,Iron,HairDryer,0.817591
780,HairDryer,Iron,0.817591
1027,Iron,Hangers,0.794642
822,Hangers,Iron,0.794642
1111,LaptopFriendlyWorkspace,Hangers,0.763725
824,Hangers,LaptopFriendlyWorkspace,0.763725


In [8]:
# Remove dummy variables judged collinear or irrelevant
columnsToDrop = ['Washer/Dryer', 'Dryer', 'Internet', 'Petsliveonthisproperty']
dummies = dummies.drop(columns=columnsToDrop)

In [9]:
# Attach the amenity dummy columns to the cleaned frame, aligned on the row index
# (left join: every cleanData row is kept)
cleanData = cleanData.join(dummies, how='left')

## Removing Null Values and Stratifying Target Listings ##

In [10]:
# Drop every row that has a null/NaN value in any column
cleanData = cleanData.dropna(axis=0, how='any')

In [11]:
# Subset of the cleaned data whose review scores rating is 80 or higher
hasHighRating = cleanData['review_scores_rating'] >= 80
cleanDataTarget = cleanData.loc[hasHighRating]

In [12]:
# Write both cleaned frames to CSV without the index column:
#   1. all cleaned listings
#   2. only the listings with a review scores rating of 80 or higher
exportTargets = [
    ('../data/clean_listings.csv', cleanData),
    ('../data/target_clean_listings.csv', cleanDataTarget),
]
for export_filename, frameToExport in exportTargets:
    frameToExport.to_csv(export_filename, index=False)