# Final Project: Phase 3: Manipulating Maps Data with Pandas

In [1]:
import numpy as np
import pandas as pd
import MySQLdb as mdb
import MySQL_data_file as MySQL_data

In [2]:
#open the single MySQL connection that the cells below share;
#host, user, password and database name come from the local MySQL_data_file module
con = mdb.connect(
    MySQL_data.my_sql_host,
    MySQL_data.my_sql_user,
    MySQL_data.my_sql_passwd,
    MySQL_data.my_sql_database,
)
#the connection is left open here; call con.close() once the database is no longer needed

In [3]:
def run_sql(query, params=None):
    """Run one SQL statement on the shared connection and return all of its rows.

    query  -- SQL text; use %s placeholders for any values
    params -- optional sequence (or mapping) of values that the driver binds to
              the placeholders, so values never have to be formatted into the
              SQL string by hand

    Returns whatever cursor.fetchall() yields for the statement.
    The statement is committed on the module-level connection `con`, and the
    cursor is always closed, even when execution raises.
    """
    cur = con.cursor()
    try:
        if params is None:
            cur.execute(query)
        else:
            cur.execute(query, params)
        #read the result set before committing so that no pending rows are
        #left on the cursor when the transaction ends
        rows = cur.fetchall()
        con.commit()
        return rows
    finally:
        #release the cursor on success and on failure alike
        cur.close()

#### Read in the data from SQL and combine into a single DataFrame

In [182]:
gmaps_df = pd.read_sql("""SELECT entry_id,timestamp,datetime,origins,destinations,travel_mode,duration,distance,fare
                        FROM gmaps_data""",con)
bing_df = pd.read_sql("""SELECT entry_id,timestamp,datetime,origins,destinations,travel_mode,duration_traffic,distance,congestion
                        FROM mmaps_data""",con)
#rename only the bing maps "duration_traffic" column to "duration" so it matches gmaps_df;
#renaming by label (instead of overwriting every column name by position) keeps the
#labels correct even if the SELECT list above is reordered or extended
bing_df = bing_df.rename(columns={'duration_traffic': 'duration'})
#an outer merge without explicit keys joins on every column the two frames share,
#so rows from both sources are kept; "fare" exists only in gmaps_df and
#"congestion" only in bing_df
#merge already returns a new DataFrame, so no extra .copy() is needed
combined_df = gmaps_df.merge(bing_df, how='outer')
#combined_df.info()

#### Remove all rows in the combined DataFrame where duration is null

Note: All data collected between April 22 and April 25 appears to be corrupted. Those rows, and any references to them, should be removed.

In [183]:
#can also use sql to look for null values: select * from gmaps_data where duration is NULL
#keep only the rows that have a duration, ordered by entry_id
#ascending has to be the boolean True: the string 'True' only worked because any
#non-empty string is truthy (the string 'False' would have sorted ascending as well)
#NOTE(review): DataFrame.sort(columns=...) is the older pandas API; newer pandas
#releases spell this sort_values(by='entry_id') - confirm against the installed version
has_duration = combined_df['duration'].notnull()
combined_df = (combined_df
               .loc[has_duration, :]
               .sort(columns='entry_id', ascending=True)
               .copy())

In [184]:
#sanity check: number of rows without a duration, then number of rows with one
duration_missing = combined_df['duration'].isnull()
print(len(combined_df[duration_missing]))
print(len(combined_df[~duration_missing]))
#combined_df.info()

0
58248


In [13]:
#preview the first five rows of the combined frame
combined_df.head(n=5)

Unnamed: 0,entry_id,timestamp,datetime,origins,destinations,travel_mode,duration,distance,fare,congestion
0,1,2015-04-20 01:14:42,2015-04-20 08:14:42,"Outer Sunset, San Francisco, CA, USA","Financial District, San Francisco, CA, USA",transit,2661,11969,2.0,
1,1,2015-04-27 02:32:58,2015-04-27 09:32:56,"Oakland, CA","Pacific Heights, San Francisco, CA",driving,1750,23,,
2,2,2015-04-20 01:14:42,2015-04-20 08:14:42,"Outer Sunset, San Francisco, CA, USA","Oakland, CA, USA",transit,6056,30158,,
3,2,2015-04-27 02:32:58,2015-04-27 09:32:56,"Oakland, CA","Outer Richmond, San+Francisco, CA",driving,2089,27,,
4,3,2015-04-20 01:14:42,2015-04-20 08:14:42,"Noe Valley, San Francisco, CA, USA","Financial District, San Francisco, CA, USA",transit,3182,9355,2.0,


#### The code below removes all rows where the origin and destination are both Oakland (these trips have a distance of 0). This only applies to the gmaps data.

In [187]:
#The below code gets rid of all rows where the distance value is 0 (e.g. Oakland to Oakland trips)
#The same rows could also be targeted by name, using ~ to negate the mask:
#   combined_df[~((combined_df['origins'] == 'Oakland, CA, USA')
#                 & (combined_df['destinations'] == 'Oakland, CA, USA'))]
#Filtering on distance avoids depending on the exact place-name strings
nonzero_distance = combined_df['distance'] != 0
combined_df2 = combined_df.loc[nonzero_distance].copy()
#print len(combined_df2)
#combined_df2.head()

#### Normalizing origin/destination names by replacing with shorter names

In [188]:
#work on an independent copy so the name replacements below leave combined_df2 untouched
combined_df3 = combined_df2.copy(deep=True)

In [197]:
#Collect the distinct origin and destination names, each as a sorted array
origins_sorted = np.sort(combined_df3['origins'].unique())
destinations_sorted = np.sort(combined_df3['destinations'].unique())
#for value in origins_sorted: print value
#for value in destinations_sorted: print value

In [190]:
#short names that will replace the longer origin/destination strings
new_names = [
    'Oakland',
    'Berkeley',
    'Financial District',
    'Mission',
    'Mountain View',
    'Noe Valley',
    'North Beach',
    'Russian Hill',
    'Pacific Heights',
    'Outer Sunset',
    'Outer Richmond',
]

In [191]:
#the following code normalizes the district and city names in the origin and destination columns:
#any value that contains one of the short names from new_names is replaced by that short name
for short_name in new_names:
    for column in ('origins', 'destinations'):
        #regex=False matches the name literally instead of as a regular expression;
        #na=False treats missing values as "no match" so the mask stays purely boolean
        matches = combined_df3[column].str.contains(short_name, regex=False, na=False)
        #.loc with a boolean mask selects the matching rows by label
        combined_df3.loc[matches, column] = short_name

In [198]:
#check to make sure that name normalization was successful
#combined_df3.head()
#print np.sort(pd.Series.unique(combined_df3['origins']))
#print np.sort(pd.Series.unique(combined_df3['destinations']))
##################################
#count the rows whose origin and destination still differ after the renaming;
#if any pair had been collapsed onto the same short name, this count would be
#smaller than len(combined_df3)
pair_differs = combined_df3['origins'] != combined_df3['destinations']
len(combined_df3[pair_differs])

57750

#### Let's focus now on what is most important: duration (or travel time between origin and destination)

In [195]:
#focusing data in on the duration value (let's forget about distance, etc.):
#select the first seven columns by position, entry_id through duration
combined_df3.iloc[:, 0:7].head()

Unnamed: 0,entry_id,timestamp,datetime,origins,destinations,travel_mode,duration
0,1,2015-04-20 01:14:42,2015-04-20 08:14:42,Outer Sunset,Financial District,transit,2661
1,1,2015-04-27 02:32:58,2015-04-27 09:32:56,Oakland,Pacific Heights,driving,1750
2,2,2015-04-20 01:14:42,2015-04-20 08:14:42,Outer Sunset,Oakland,transit,6056
3,2,2015-04-27 02:32:58,2015-04-27 09:32:56,Oakland,Outer Richmond,driving,2089
4,3,2015-04-20 01:14:42,2015-04-20 08:14:42,Noe Valley,Financial District,transit,3182


#### Now let's work to separate out weekends from the data (perhaps create a separate DataFrame for weekends/weekdays)

In [None]:
#NOTE(review): the original line stopped in the middle of the file name, which is a syntax error;
#the "cleaned.csv" ending is a placeholder - confirm the intended name
combined_df3.to_csv('maps_may12_afternoon_cleaned.csv')

In [None]:
#write to csv
#df.to_csv('foo.csv')
#read from csv
#pd.read_csv('foo.csv')