## Data Wrangling: Combining Data

This notebook follows the work completed in the Data Wrangling: Web Scraping notebook. Now that the raw files have been created and stored, the data for each city will be consolidated by category (listings and reviews) into pandas DataFrames and saved as csv files.

In [1]:
# import relevant packages
import pandas as pd
import shutil
import os
import time

### Functions to Consolidate the Data
Because there are hundreds of raw files to process, with hundreds of thousands of lines of data, it helps to create functions that will do the heavy lifting for us. This heavy lifting includes:
1. Checking if the consolidated csv files we want already exist on the computer: **consolidate_data**.
2. Concatenating data of the same city and category (listings or reviews) together: **combine_listings, combine_reviews, and concat_files**.
3. Saving the concatenated data as a csv file: **export_csv**.

In [2]:
def consolidate_data(city, directory, target):
    """ Creates the consolidated listings and reviews csv files
        for the designated city, skipping any that already exist.

        For each category ('listings', then 'reviews') the file
        '<city>_<category>.csv' is looked up in the target folder.
        If it is missing, combine_listings or combine_reviews builds
        the dataframe from the raw files in the directory and
        export_csv saves it. If it is already there, that category
        is left untouched.

        city:      city name as used at the start of the raw file names
        directory: folder holding the raw files
        target:    folder holding the consolidated csv files (with or
                   without a trailing path separator)
    """
    # (category, combine function) pairs, handled in this order
    steps = (('listings', combine_listings), ('reviews', combine_reviews))

    for category, combine in steps:
        filename = city + '_' + category + '.csv'
        # os.path.join works whether or not target ends with a separator;
        # plain string concatenation would check a wrong path without one
        if not os.path.isfile(os.path.join(target, filename)):
            category_df = combine(city, directory)
            export_csv(city, filename, category_df, target)



In [3]:
#### FUNCTION FOR LISTINGS #### 
def combine_listings(city, directory):
    """ Goes through the files in the directory and selects the
        listings files of the designated city, then passes the
        selected file names and the directory name to the
        concat_files function and returns its result.

        A file belongs to the city when the part of its name before
        the first underscore equals the city exactly. A substring
        test is not enough: 'manchester' is contained in
        'greater-manchester', so the two cities would be mixed.

        city:      city name as used at the start of the raw file names
        directory: folder holding the raw files
    """
    # sorted() makes the concatenation order reproducible; os.listdir
    # returns names in arbitrary order
    target_files = [
        file for file in sorted(os.listdir(directory))
        if file.split('_')[0] == city and 'listings' in file
    ]

    # concatenate files in list
    return concat_files(target_files, directory)

In [4]:
#### FUNCTION FOR REVIEWS #### 
def combine_reviews(city, directory):
    """ Goes through the files in the directory and selects the
        reviews files of the designated city, then passes the
        selected file names and the directory name to the
        concat_files function and returns its result.

        A file belongs to the city when the part of its name before
        the first underscore equals the city exactly. A substring
        test is not enough: 'manchester' is contained in
        'greater-manchester', so the two cities would be mixed.

        city:      city name as used at the start of the raw file names
        directory: folder holding the raw files
    """
    # sorted() makes the concatenation order reproducible; os.listdir
    # returns names in arbitrary order
    target_files = [
        file for file in sorted(os.listdir(directory))
        if file.split('_')[0] == city and 'reviews' in file
    ]

    # concatenate files in list
    return concat_files(target_files, directory)

In [5]:
def concat_files(file_list, directory):
    """Reads every file in the list into a pandas dataframe and
       stacks the dataframes into one.

       Each dataframe gets a 'date_recorded' column whose value is
       the second underscore-separated part of the file name. The
       dataframes are concatenated row-wise, rows that are exact
       duplicates (including 'date_recorded') are dropped, and the
       index is renumbered from 0.

       file_list: names of the files to read, relative to directory
       directory: folder holding the files (with or without a
                  trailing path separator)

       Returns the combined pandas dataframe.
       Raises ValueError if file_list is empty.
    """
    if not file_list:
        # pd.concat raises ValueError for an empty list too; raising here
        # keeps the exception type and says which folder had no match
        raise ValueError('no files to concatenate in ' + repr(directory))

    frames = []
    for file in file_list:
        # os.path.join works whether or not directory ends with a separator
        frame = pd.read_csv(os.path.join(directory, file))

        # add column of the date (taken from the file name)
        frame['date_recorded'] = file.split('_')[1]

        frames.append(frame)

    # stack the rows, get rid of duplicates, renumber the index
    return (
        pd.concat(frames)
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [6]:
def export_csv(city, filename, df, target):
    """ Saves the dataframe as a csv file named filename inside the
        target folder, unless that file already exists there.

        The csv is first written under a temporary name in the target
        folder and then renamed, so an interrupted run does not leave
        a half-written file under the final name (other code decides
        whether to rebuild a city by checking that the final file exists).
        The target folder is created if it is missing.

        city:     not used here; kept so existing calls keep working
        filename: name of the csv file to create
        df:       pandas dataframe to save (written without the index)
        target:   folder the csv file is stored in (with or without a
                  trailing path separator)
    """
    destination = os.path.join(target, filename)
    # nothing to do when the consolidated file is already in place
    if os.path.isfile(destination):
        return

    os.makedirs(target, exist_ok=True)

    partial = destination + '.part'
    try:
        df.to_csv(partial, index=False)
        # rename within the same folder; replaces atomically on one filesystem
        os.replace(partial, destination)
    finally:
        # only still present when writing or renaming failed
        if os.path.exists(partial):
            os.remove(partial)

#### The following code uses the above functions on the project data:

In [7]:
# identify the directory and target folder
# The project folder defaults to the original location; on another machine it
# can be redirected by setting the CAPSTONE_DATA_DIR environment variable
# before starting the notebook.
data_root = os.environ.get(
    'CAPSTONE_DATA_DIR',
    '/Users/limesncoconuts2/springboard_data/data_capstone_one',
)
# the final '' keeps the trailing path separator, so code that appends a
# file name with '+' still builds a valid path
directory = os.path.join(data_root, 'web_scraped', '')
target = os.path.join(data_root, 'csv', '')

In [8]:
# get list of unique cities in alphabetical order
# The city is the part of a raw file name before the first underscore.
# Hidden files (e.g. '.DS_Store') and names without an underscore are left
# out so they are not mistaken for cities and passed on to consolidation.
unique_cities = sorted({
    file.split('_')[0]
    for file in os.listdir(directory)
    if not file.startswith('.') and '_' in file
})
print(len(unique_cities), ' cities')
print(unique_cities)

84  cities
['amsterdam', 'antwerp', 'asheville', 'athens', 'austin', 'barcelona', 'barossa-valley', 'barwon-south-west-vic', 'beijing', 'bergamo', 'berlin', 'bologna', 'bordeaux', 'boston', 'bristol', 'brussels', 'cambridge', 'cape-town', 'chicago', 'clark-county-nv', 'columbus', 'copenhagen', 'denver', 'dublin', 'edinburgh', 'euskadi', 'florence', 'geneva', 'ghent', 'girona', 'greater-manchester', 'hawaii', 'hong-kong', 'istanbul', 'lisbon', 'london', 'los-angeles', 'lyon', 'madrid', 'malaga', 'mallorca', 'manchester', 'melbourne', 'menorca', 'milan', 'montreal', 'naples', 'nashville', 'new-orleans', 'new-york-city', 'northern-rivers', 'oakland', 'oslo', 'pacific-grove', 'paris', 'portland', 'porto', 'prague', 'puglia', 'quebec-city', 'rhode-island', 'rio-de-janeiro', 'rome', 'salem-or', 'san-diego', 'san-francisco', 'santa-clara-county', 'santa-cruz-county', 'seattle', 'sevilla', 'sicily', 'stockholm', 'sydney', 'taipei', 'tasmania', 'toronto', 'trentino', 'twin-cities-msa', 'vancouv

In [None]:
# run function on the list of cities
for city in unique_cities:
    # os.path.join works whether or not target ends with a path separator
    listings_path = os.path.join(target, city + '_listings.csv')
    reviews_path = os.path.join(target, city + '_reviews.csv')
    # consolidate only when at least one of the two csv files is still missing
    if not (os.path.isfile(listings_path) and os.path.isfile(reviews_path)):
        consolidate_data(city, directory, target)