# Final Project: Phase 3: Manipulating Maps Data with Pandas

In [1]:
import numpy as np
import pandas as pd
import MySQLdb as mdb
import MySQL_data_file as MySQL_data

In [2]:
con = mdb.connect(MySQL_data.my_sql_host, MySQL_data.my_sql_user,\
                    MySQL_data.my_sql_passwd,\
                    MySQL_data.my_sql_database)
#con.close

In [3]:
def run_sql(query):
    cur = con.cursor()
    cur.execute(query)
    con.commit()
    #con.close()
    return cur.fetchall()

####Read in the data from SQL and combine into a single DataFrame

In [223]:
gmaps_df = pd.read_sql("""SELECT entry_id,timestamp,datetime,origins,destinations,travel_mode,duration,distance,fare
                        FROM gmaps_data""",con)
bing_df = pd.read_sql("""SELECT entry_id,timestamp,datetime,origins,destinations,travel_mode,duration_traffic,distance,congestion
                        FROM mmaps_data""",con)
#changing the bing maps "duration_traffic" column to "duration" in order to match up the column with the gmaps_df
bing_df.columns = [u'entry_id', u'timestamp', u'datetime', u'origins', u'destinations', u'travel_mode', u'duration', u'distance', u'congestion']
combined_df = gmaps_df.merge(bing_df, how='outer')
combined_df = combined_df.copy()
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58432 entries, 0 to 58431
Data columns (total 10 columns):
entry_id        58432 non-null float64
timestamp       58432 non-null datetime64[ns]
datetime        58432 non-null datetime64[ns]
origins         58432 non-null object
destinations    58432 non-null object
travel_mode     58432 non-null object
duration        58248 non-null float64
distance        58248 non-null float64
fare            19981 non-null float64
congestion      23942 non-null object
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 4.9+ MB


####Remove all rows in the combined DataFrame where duration is Null

Note: It appears that all data between April 22 and 25 was corrupted. Should delete or remove any references

In [224]:
#can also use sql to look for null values: select * from gmaps_data where duration is NULL
combined_df = combined_df.ix[pd.notnull(combined_df['duration']),:].sort(columns='entry_id', ascending='True').copy()

In [225]:
print len(combined_df[pd.isnull(combined_df['duration'])])
print len(combined_df[pd.notnull(combined_df['duration'])])
#combined_df.info()

0
58248


In [226]:
combined_df.head()

Unnamed: 0,entry_id,timestamp,datetime,origins,destinations,travel_mode,duration,distance,fare,congestion
0,1,2015-04-20 01:14:42,2015-04-20 08:14:42,"Outer Sunset, San Francisco, CA, USA","Financial District, San Francisco, CA, USA",transit,2661,11969,2.0,
1,1,2015-04-27 02:32:58,2015-04-27 09:32:56,"Oakland, CA","Pacific Heights, San Francisco, CA",driving,1750,23,,
2,2,2015-04-20 01:14:42,2015-04-20 08:14:42,"Outer Sunset, San Francisco, CA, USA","Oakland, CA, USA",transit,6056,30158,,
3,2,2015-04-27 02:32:58,2015-04-27 09:32:56,"Oakland, CA","Outer Richmond, San+Francisco, CA",driving,2089,27,,
4,3,2015-04-20 01:14:42,2015-04-20 08:14:42,"Noe Valley, San Francisco, CA, USA","Financial District, San Francisco, CA, USA",transit,3182,9355,2.0,


####The code below separates out all rows where the origin and destination are Oakland. This only applies to gmaps data.

In [227]:
#Using ~ to select opposite
#print len(combined_df)
#print len(combined_df[((combined_df['origins'] =='Oakland, CA, USA')\
#                        & (combined_df['destinations'] == 'Oakland, CA, USA'))])
'''
print len(combined_df[~((combined_df['origins'] =='Oakland, CA, USA')\
                        & (combined_df['destinations'] == 'Oakland, CA, USA'))])
#Below code produces the same as the previous selection
print len(combined_df[((combined_df['origins'] !='Oakland, CA, USA')\
                          | (combined_df['destinations'] != 'Oakland, CA, USA'))])
'''
#The below code gets rid of all rows where the duration value is 0 (e.g. Oakland to Oakland trips)
combined_df2 = combined_df[combined_df['distance'] != 0].copy()
#print len(combined_df2)
#combined_df2.head()

####Normalizing origin/destination names by replacing with shorter names

In [237]:
combined_df3 = combined_df2.copy()

In [238]:
#The following code displays/stores the distinct names in the origins and destinations
origins_sorted = np.sort(pd.Series.unique(combined_df3['origins']))
destinations_sorted = np.sort(pd.Series.unique(combined_df3['destinations']))
#for value in origins_sorted: print value
#for value in destinations_sorted: print value

In [239]:
#this is the list of names that will be replacing longer names for origins/destinations
new_names = ['Oakland','Berkeley', 'Financial District', 'Mission',\
             'Mountain View', 'Noe Valley', 'North Beach', 'Russian Hill',\
             'Pacific Heights', 'Outer Sunset', 'Outer Richmond']

In [240]:
#the following code normalizes the district and city names in the origin and destination columns,
#replaces longer names with shorter names from list above
for short_name in new_names:
    combined_df3.ix[combined_df3['origins'].str.contains(short_name), 'origins'] = short_name
    combined_df3.ix[combined_df3['destinations'].str.contains(short_name), 'destinations'] = short_name

In [241]:
#check to make sure that name normalization was successful
#combined_df3.head()
#print np.sort(pd.Series.unique(combined_df3['origins']))
#print np.sort(pd.Series.unique(combined_df3['destinations']))
##################################
#making sure that there were no incidents in updating the origins and destinations
#by looking to see if any origin/destination pairs are equivalent
len(combined_df3[combined_df3['origins'] != combined_df3['destinations']])

57750

####Let's focus now on what is most important: duration (or travel time between origin and destination)

In [242]:
combined_df3.ix[:,0:7].head() #focusing data in on the duration value (let's forget about distance, etc.)

Unnamed: 0,entry_id,timestamp,datetime,origins,destinations,travel_mode,duration
0,1,2015-04-20 01:14:42,2015-04-20 08:14:42,Outer Sunset,Financial District,transit,2661
1,1,2015-04-27 02:32:58,2015-04-27 09:32:56,Oakland,Pacific Heights,driving,1750
2,2,2015-04-20 01:14:42,2015-04-20 08:14:42,Outer Sunset,Oakland,transit,6056
3,2,2015-04-27 02:32:58,2015-04-27 09:32:56,Oakland,Outer Richmond,driving,2089
4,3,2015-04-20 01:14:42,2015-04-20 08:14:42,Noe Valley,Financial District,transit,3182


####Quick aside: practice saving the data in a csv file and reading it back into a dataframe)

In [243]:
combined_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57750 entries, 0 to 58431
Data columns (total 10 columns):
entry_id        57750 non-null float64
timestamp       57750 non-null datetime64[ns]
datetime        57750 non-null datetime64[ns]
origins         57750 non-null object
destinations    57750 non-null object
travel_mode     57750 non-null object
duration        57750 non-null float64
distance        57750 non-null float64
fare            19981 non-null float64
congestion      23942 non-null object
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 4.8+ MB


In [696]:
#combined_df3.to_csv('maps_may12_afternoon.csv')
combined_df4 = pd.read_csv('maps_may12_afternoon.csv')
combined_df4.columns = [u'pandas_id',u'entry_id', u'timestamp', u'datetime', u'origins', u'destinations', u'travel_mode', u'duration', u'distance', u'fare', u'congestion']
combined_df4.drop(['entry_id','timestamp'], axis=1, inplace=True) #drop redundant entry_id's from MYSQL and replace with distinct pandas_id's

####Note: when reading in from csv, need to re-assign the timestamp and datetime columns as datetime objects

In [688]:
from datetime import datetime
from datetime import timedelta

In [697]:
#two ways to do this...can use "df.astype(dtype)" or "pd.to_datetime". The issue with using the 
#pandas "pd.to_datetime" is that it only applies to a series or list-like object
####################################################################
#combined_df4.loc[:,['timestamp', 'datetime']] = combined_df4[['timestamp', 'datetime']].astype('datetime64[ns]')
#combined_df4['timestamp'] = combined_df4['timestamp'].astype('datetime64[ns]')
#combined_df4['timestamp'] = pd.to_datetime(combined_df4['timestamp'])
#pd.to_datetime((combined_df4.loc[:,['datetime']].values))
#timestamp_values = combined_df4['timestamp'].values
datetime_values = combined_df4['datetime'].values
#combined_df4.loc[:,['timestamp']] = pd.to_datetime(timestamp_values)
combined_df4.loc[:,['datetime']] = pd.to_datetime(datetime_values)
combined_df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57750 entries, 0 to 57749
Data columns (total 9 columns):
pandas_id       57750 non-null int64
datetime        57750 non-null datetime64[ns]
origins         57750 non-null object
destinations    57750 non-null object
travel_mode     57750 non-null object
duration        57750 non-null float64
distance        57750 non-null float64
fare            19981 non-null float64
congestion      23942 non-null object
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 4.4+ MB


####Now let's work to separate out weekends from the data (perhaps create a separate dataframe for weekends/weekdays)

Create dataframe containing only duration information for each route.

In [698]:
duration_df = combined_df4.iloc[:,:7].copy()
#duration_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57750 entries, 0 to 57749
Data columns (total 7 columns):
pandas_id       57750 non-null int64
datetime        57750 non-null datetime64[ns]
origins         57750 non-null object
destinations    57750 non-null object
travel_mode     57750 non-null object
duration        57750 non-null float64
distance        57750 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 3.5+ MB


In [718]:
data_datetime_range = duration_df.datetime
column_names = duration_df.columns

Unnamed: 0_level_0,pandas_id,datetime,origins,destinations,travel_mode,duration,distance
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-04-20 08:14:42,0,2015-04-20 08:14:42,Outer Sunset,Financial District,transit,2661,11969
2015-04-27 09:32:56,1,2015-04-27 09:32:56,Oakland,Pacific Heights,driving,1750,23
2015-04-20 08:14:42,2,2015-04-20 08:14:42,Outer Sunset,Oakland,transit,6056,30158
2015-04-27 09:32:56,3,2015-04-27 09:32:56,Oakland,Outer Richmond,driving,2089,27
2015-04-20 08:14:42,4,2015-04-20 08:14:42,Noe Valley,Financial District,transit,3182,9355


In [None]:
df_ts1 = pd.DataFrame(duration_df2.values, index=data_datetime_range, columns=column_names)
df_ts1.head()

In [735]:
df_ts1.index = df_ts1.index.tz_localize('UTC')

TypeError: Already tz-aware, use tz_convert to convert.

In [734]:
df_ts1.index = df_ts1.index.tz_convert('US/Pacific')

In [None]:
df_ts1.index

In [None]:
df_ts1.sort_index(axis=0, ascending=True, inplace=True)
df_ts1.sort(columns=['datetime','origins','destinations','travel_mode'], inplace=True)

Convert the time zone of the datetime column to PST (it is currently UTC)

In [444]:
#pd.to_datetime(duration_df['datetime'], utc=True)

In [477]:
#create separate arrays (columns) for a localized datetime, date, hour, minute

In [729]:
df_ts1[df_ts1['travel_mode'] == 'driving']

Unnamed: 0_level_0,pandas_id,datetime,origins,destinations,travel_mode,duration,distance
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-04-27 02:32:56-07:00,7,2015-04-27 09:32:56,Oakland,Mission,driving,1397,20
2015-04-27 02:32:56-07:00,9,2015-04-27 09:32:56,Oakland,Noe Valley,driving,1576,24
2015-04-27 02:32:56-07:00,3,2015-04-27 09:32:56,Oakland,Outer Richmond,driving,2089,27
2015-04-27 02:32:56-07:00,5,2015-04-27 09:32:56,Oakland,Outer Sunset,driving,2097,36
2015-04-27 02:32:56-07:00,1,2015-04-27 09:32:56,Oakland,Pacific Heights,driving,1750,23
2015-04-27 02:32:57-07:00,25,2015-04-27 09:32:57,Financial District,Berkeley,driving,1310,19
2015-04-27 02:32:57-07:00,19,2015-04-27 09:32:57,Financial District,Mission,driving,1038,5
2015-04-27 02:32:57-07:00,21,2015-04-27 09:32:57,Financial District,Noe Valley,driving,1215,9
2015-04-27 02:32:57-07:00,23,2015-04-27 09:32:57,Financial District,Oakland,driving,1120,17
2015-04-27 02:32:57-07:00,15,2015-04-27 09:32:57,Financial District,Outer Richmond,driving,1659,13


In [None]:
#write to csv
#df.to_csv('foo.csv')
#read from csv
#pd.read_csv('foo.csv')
'''
normalizaiton
regularization
regression
auto regression
arma
statsmodels
time series
'''