## Data Wrangling: Merging Data

**Goal:** 

1. import all files as pandas dataframes, name = 'city' + 'listings' OR 'reviews' + number
2. concatenate all dataframes from the same city and listings OR reviews to remove duplicate data
3. export to csv and save in a new folder

In [1]:
# import relevant packages
import pandas as pd
import shutil
import os
import time

In [2]:
# concatenate listings and reviews data for each city
# export csv to target directory
def consolidate_data(city, directory, target):
    
    filename = city + '_listings.csv'
    # if listings file for this city doesn't already exist, create listings_df and save as csv
    if(not os.path.isfile(target + filename)):
        listings_df = combine_listings(city, directory)
        export_csv(city, filename, listings_df, target)
    
    filename = city + '_reviews.csv'
    # if reviews file for this city doesn't already exist, create reviews_df and save as csv
    if(not os.path.isfile(target + filename)):
        reviews_df = combine_reviews(city, directory)
        export_csv(city, filename, reviews_df, target)



In [3]:
#### FUNCTION FOR LISTINGS #### 
def combine_listings(city, directory):
    target_files = []
    
    for file in os.listdir(directory):
        # check if file from the target city and is listings data
        if city in file and 'listings' in file:
            # add to list of target files
            target_files.append(file)
            
    # concatenate files in list
    return concat_files(target_files, directory) 

In [4]:
#### FUNCTION FOR REVIEWS #### 
def combine_reviews(city, directory):
    target_files = []
    
    for file in os.listdir(directory):
        # check if file from the target city and is listings data
        if city in file and 'reviews' in file:
            # add to list of target files
            target_files.append(file)
            
    # concatenate files in list
    return concat_files(target_files, directory) 

In [5]:
def concat_files(file_list, directory):
    all_files = []
    
    for file in file_list:
        # make into a pandas dataframe
        df = pd.read_csv(directory + file)
        
        # add column of the date
        df['date_recorded'] = file.split('_')[1]
        
        # append to a list of dataframes
        all_files.append(df)
    
    # append dataframes together along x-axis
    concat_all = pd.concat(all_files)
    # get rid of duplicates
    unique_all = concat_all.drop_duplicates()
    # reset index
    unique_all.reset_index(drop=True, inplace=True)
    return unique_all

In [6]:
def export_csv(city, filename, df, target):
    current_dir = os.getcwd() + '/' + filename
    # export listings dataframe to csv if file doesn't already exist
    if(not os.path.isfile(current_dir)):
        df.to_csv(filename, index=False)
        # move csv to target directory
        shutil.move(current_dir, target)

In [7]:
#### EXECUTION ON FILES ####
directory = '/Users/limesncoconuts2/springboard_data/data_capstone_one/web_scraped/'
target = '/Users/limesncoconuts2/springboard_data/data_capstone_one/csv/'

In [8]:
# get list of unique cities in alphabetical order
unique_cities = []
for file in os.listdir(directory):
    unique_cities.append(file.split('_')[0])
unique_cities = list(set(unique_cities))
unique_cities.sort()
print(len(unique_cities), ' cities')
print(unique_cities)

84  cities
['amsterdam', 'antwerp', 'asheville', 'athens', 'austin', 'barcelona', 'barossa-valley', 'barwon-south-west-vic', 'beijing', 'bergamo', 'berlin', 'bologna', 'bordeaux', 'boston', 'bristol', 'brussels', 'cambridge', 'cape-town', 'chicago', 'clark-county-nv', 'columbus', 'copenhagen', 'denver', 'dublin', 'edinburgh', 'euskadi', 'florence', 'geneva', 'ghent', 'girona', 'greater-manchester', 'hawaii', 'hong-kong', 'istanbul', 'lisbon', 'london', 'los-angeles', 'lyon', 'madrid', 'malaga', 'mallorca', 'manchester', 'melbourne', 'menorca', 'milan', 'montreal', 'naples', 'nashville', 'new-orleans', 'new-york-city', 'northern-rivers', 'oakland', 'oslo', 'pacific-grove', 'paris', 'portland', 'porto', 'prague', 'puglia', 'quebec-city', 'rhode-island', 'rio-de-janeiro', 'rome', 'salem-or', 'san-diego', 'san-francisco', 'santa-clara-county', 'santa-cruz-county', 'seattle', 'sevilla', 'sicily', 'stockholm', 'sydney', 'taipei', 'tasmania', 'toronto', 'trentino', 'twin-cities-msa', 'vancouv

In [9]:
for city in unique_cities:
    print(city) # to see which cities' files have been processed
    # if both files haven't been created, continue to create the consolidated csv files for that city
    if(not os.path.isfile(target + city + '_listings.csv') or not os.path.isfile(target + city + '_reviews.csv')):
        consolidate_data(city, directory, target)

amsterdam
antwerp
asheville
athens
austin
barcelona
barossa-valley
barwon-south-west-vic
beijing
bergamo
berlin
bologna
bordeaux
boston
bristol
brussels
cambridge
cape-town
chicago
clark-county-nv
columbus
copenhagen
denver
dublin
edinburgh
euskadi
florence
geneva
ghent
girona
greater-manchester
hawaii
hong-kong
istanbul
lisbon
london
los-angeles
lyon
madrid
malaga
mallorca
manchester
melbourne
menorca
milan
montreal
naples
nashville
new-orleans
new-york-city
northern-rivers
oakland
oslo
pacific-grove
paris
portland


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  from ipykernel import kernelapp as app


porto
prague
puglia


  """
  """
  """


quebec-city
rhode-island
rio-de-janeiro


  """
  """


rome
salem-or
san-diego


  """


san-francisco
santa-clara-county
santa-cruz-county
seattle
sevilla
sicily
stockholm
sydney


  """
  """
  """
  """
  """
  """


taipei
tasmania
toronto
trentino
twin-cities-msa
vancouver
venice
victoria
vienna
washington-dc


  """


western-australia


  """
