# Final Project: Phase 3: Manipulating Maps Data with Pandas

In [13]:
import numpy as np
import pandas as pd
import MySQLdb as mdb
import MySQL_data_file as MySQL_data

In [14]:
# Open the single MySQL connection shared by every query in this notebook.
# Host, user, password and database name are read from the local
# MySQL_data_file module rather than being written out here.
# The connection is left open; call con.close() once the notebook is finished.
con = mdb.connect(
    MySQL_data.my_sql_host,
    MySQL_data.my_sql_user,
    MySQL_data.my_sql_passwd,
    MySQL_data.my_sql_database,
)

In [15]:
def run_sql(query, params=None):
    """Execute one SQL statement on the shared connection and return its rows.

    Parameters
    ----------
    query : str
        SQL text to execute. Use DB-API placeholders (``%s``) together with
        ``params`` instead of formatting values into the string.
    params : sequence or mapping, optional
        Values bound to the placeholders in ``query``. When omitted, the query
        is executed exactly as given.

    Returns
    -------
    Whatever ``cursor.fetchall()`` returns for the executed statement.

    Notes
    -----
    Uses the module-level ``con`` connection. The statement is committed after
    its rows have been fetched, and the cursor is always closed, even when
    execution raises, so repeated calls do not accumulate open cursors.
    """
    cur = con.cursor()
    try:
        if params is None:
            cur.execute(query)
        else:
            cur.execute(query, params)
        # Read the result set before committing so the rows are in hand
        # regardless of how the cursor buffers results.
        rows = cur.fetchall()
        con.commit()
        return rows
    finally:
        cur.close()

In [16]:
# Pull the gmaps_data table into a DataFrame over the shared connection,
# then print a summary of its columns, dtypes and non-null counts.
gmaps_df = pd.read_sql(
    sql="""SELECT entry_id,timestamp,datetime,origins,destinations,travel_mode,duration,distance,fare
                        FROM gmaps_data""",
    con=con,
)
gmaps_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29628 entries, 0 to 29627
Data columns (total 9 columns):
entry_id        29628 non-null int64
timestamp       29628 non-null datetime64[ns]
datetime        29628 non-null datetime64[ns]
origins         29628 non-null object
destinations    29628 non-null object
travel_mode     29628 non-null object
duration        29454 non-null float64
distance        29454 non-null float64
fare            17128 non-null float64
dtypes: datetime64[ns](2), float64(3), int64(1), object(3)
memory usage: 2.3+ MB


In [17]:
# Pull the mmaps_data table (the Bing Maps samples) into a DataFrame over the
# shared connection, then print a summary of its columns, dtypes and non-null counts.
bing_df = pd.read_sql(
    sql="""SELECT entry_id,timestamp,datetime,origins,destinations,travel_mode,duration_traffic,distance,congestion
                        FROM mmaps_data""",
    con=con,
)
bing_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19080 entries, 0 to 19079
Data columns (total 9 columns):
entry_id            19080 non-null int64
timestamp           19080 non-null datetime64[ns]
datetime            19080 non-null datetime64[ns]
origins             19080 non-null object
destinations        19080 non-null object
travel_mode         19080 non-null object
duration_traffic    19080 non-null int64
distance            19080 non-null int64
congestion          19080 non-null object
dtypes: datetime64[ns](2), int64(3), object(4)
memory usage: 1.5+ MB


In [18]:
# Rename the Bing Maps "duration_traffic" column to "duration" so that it lines
# up with the column of the same name in gmaps_df.
# The rename is done by name rather than by overwriting the whole column list:
# a positional list silently mislabels columns if the SELECT order ever changes,
# whereas rename() touches only the one column and is a no-op if it has already
# been renamed.
bing_df = bing_df.rename(columns={'duration_traffic': 'duration'})

In [19]:
# Outer-join the two sources. No `on=` is given, so pandas joins on every
# column name the two frames share; columns that exist in only one source
# (fare from gmaps_df, congestion from bing_df) are kept and are null for rows
# that came from the other source.
combined_df = pd.merge(gmaps_df, bing_df, how='outer')

In [20]:
# Rebind combined_df to an independent deep copy of itself.
combined_df = combined_df.copy(deep=True)

In [21]:
# Summarise the merged frame: row count, per-column non-null counts and dtypes.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48708 entries, 0 to 48707
Data columns (total 10 columns):
entry_id        48708 non-null float64
timestamp       48708 non-null datetime64[ns]
datetime        48708 non-null datetime64[ns]
origins         48708 non-null object
destinations    48708 non-null object
travel_mode     48708 non-null object
duration        48534 non-null float64
distance        48534 non-null float64
fare            17128 non-null float64
congestion      19080 non-null object
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 4.1+ MB


In [42]:
# Keep only the rows that have a duration, ordered by entry_id.
# (Null values can also be looked up in SQL:
#  select * from gmaps_data where duration is NULL)
# dropna/sort_values replace the `.ix` indexer and `DataFrame.sort`, both of
# which have been removed from pandas. `ascending` is now a real boolean: the
# previous string 'True' only worked because any non-empty string is truthy.
combined_df = (
    combined_df
    .dropna(subset=['duration'])
    .sort_values(by='entry_id', ascending=True)
)

Note (from above): It appears that all data collected between April 22 and April 25 was corrupted. Those rows should be deleted, and any references to them removed.

In [51]:
# Sanity check on the duration column: print the number of null entries, then
# the number of non-null entries. print() is called as a function so the cell
# runs on both Python 2 and Python 3, and the counts are taken directly from
# the boolean masks instead of building two filtered copies of the frame.
print(combined_df['duration'].isnull().sum())
print(combined_df['duration'].notnull().sum())

0
48534


In [47]:
# Summarise combined_df again to compare row and non-null counts with the
# earlier summary.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48534 entries, 0 to 48707
Data columns (total 10 columns):
entry_id        48534 non-null float64
timestamp       48534 non-null datetime64[ns]
datetime        48534 non-null datetime64[ns]
origins         48534 non-null object
destinations    48534 non-null object
travel_mode     48534 non-null object
duration        48534 non-null float64
distance        48534 non-null float64
fare            17128 non-null float64
congestion      19080 non-null object
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 4.1+ MB


In [63]:
# Preview the first rows of combined_df.
combined_df.head()

Unnamed: 0,entry_id,timestamp,datetime,origins,destinations,travel_mode,duration,distance,fare,congestion
0,1,2015-04-20 01:14:42,2015-04-20 08:14:42,"Outer Sunset, San Francisco, CA, USA","Financial District, San Francisco, CA, USA",transit,2661,11969,2.0,
1,1,2015-04-27 02:32:58,2015-04-27 09:32:56,"Oakland, CA","Pacific Heights, San Francisco, CA",driving,1750,23,,
2,2,2015-04-20 01:14:42,2015-04-20 08:14:42,"Outer Sunset, San Francisco, CA, USA","Oakland, CA, USA",transit,6056,30158,,
3,2,2015-04-27 02:32:58,2015-04-27 09:32:56,"Oakland, CA","Outer Richmond, San+Francisco, CA",driving,2089,27,,
4,3,2015-04-20 01:14:42,2015-04-20 08:14:42,"Noe Valley, San Francisco, CA, USA","Financial District, San Francisco, CA, USA",transit,3182,9355,2.0,


In [71]:
# Show every row whose origin is exactly the string 'Oakland, CA'.
combined_df.loc[combined_df['origins'].eq('Oakland, CA')]

Unnamed: 0,entry_id,timestamp,datetime,origins,destinations,travel_mode,duration,distance,fare,congestion
1,1,2015-04-27 02:32:58,2015-04-27 09:32:56,"Oakland, CA","Pacific Heights, San Francisco, CA",driving,1750,23,,
3,2,2015-04-27 02:32:58,2015-04-27 09:32:56,"Oakland, CA","Outer Richmond, San+Francisco, CA",driving,2089,27,,
5,3,2015-04-27 02:32:58,2015-04-27 09:32:56,"Oakland, CA","Outer Sunset, San Francisco, CA",driving,2097,36,,
7,4,2015-04-27 02:32:59,2015-04-27 09:32:56,"Oakland, CA","Mission District, San Francisco, CA",driving,1397,20,,Mild
9,5,2015-04-27 02:32:59,2015-04-27 09:32:56,"Oakland, CA","Noe Valley, San+Francisco, CA",driving,1576,24,,
11,6,2015-04-27 02:32:59,2015-04-27 09:32:57,"Oakland, CA","Berkeley, CA",driving,856,12,,
71,36,2015-04-27 02:33:07,2015-04-27 09:33:07,"Oakland, CA","Financial District, San Francisco, CA",driving,1175,18,,
73,37,2015-04-27 02:33:07,2015-04-27 09:33:07,"Oakland, CA","Mountain View, CA",driving,2830,65,,
111,56,2015-04-27 02:59:05,2015-04-27 09:59:04,"Oakland, CA","Financial District, San Francisco, CA",driving,1178,18,,
113,57,2015-04-27 02:59:05,2015-04-27 09:59:04,"Oakland, CA","Mountain View, CA",driving,2813,65,,
