# Final Project: Phase 3: Manipulating Maps Data with Pandas

In [1]:
import numpy as np
import pandas as pd
import MySQLdb as mdb
import MySQL_data_file as MySQL_data

In [2]:
# Open the MySQL connection shared by the rest of the notebook. The
# credentials are read from the local MySQL_data_file module. The connection
# is not closed here: later cells pass `con` to run_sql() and pd.read_sql().
con = mdb.connect(
    host=MySQL_data.my_sql_host,
    user=MySQL_data.my_sql_user,
    passwd=MySQL_data.my_sql_passwd,
    db=MySQL_data.my_sql_database,
)
#con.close

In [3]:
def run_sql(query, params=None):
    """Execute `query` on the shared connection and return all result rows.

    Parameters
    ----------
    query : str
        SQL statement to run on the module-level connection `con`.
    params : sequence or mapping, optional
        Values the driver binds to the placeholders in `query`. Prefer this
        over building SQL with string formatting. Defaults to None, which
        runs `query` exactly as written.

    Returns
    -------
    Whatever cursor.fetchall() yields for the executed statement.

    Notes
    -----
    The transaction is committed once the rows have been fetched. The cursor
    is always closed, even when execution raises; the connection itself is
    left open for later cells.
    """
    cur = con.cursor()
    try:
        cur.execute(query, params)
        rows = cur.fetchall()
        con.commit()
        return rows
    finally:
        # Release the cursor on both the success and the error path.
        cur.close()

In [4]:
# Read the selected columns of the gmaps_data table into a DataFrame.
gmaps_query = """SELECT entry_id,timestamp,datetime,origins,destinations,travel_mode,duration,distance,fare
                        FROM gmaps_data"""
gmaps_df = pd.read_sql(gmaps_query, con)
# Show column dtypes and non-null counts for the loaded frame.
gmaps_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29628 entries, 0 to 29627
Data columns (total 9 columns):
entry_id        29628 non-null int64
timestamp       29628 non-null datetime64[ns]
datetime        29628 non-null datetime64[ns]
origins         29628 non-null object
destinations    29628 non-null object
travel_mode     29628 non-null object
duration        29454 non-null float64
distance        29454 non-null float64
fare            17128 non-null float64
dtypes: datetime64[ns](2), float64(3), int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
# Read the selected columns of the mmaps_data table into a DataFrame.
bing_query = """SELECT entry_id,timestamp,datetime,origins,destinations,travel_mode,duration_traffic,distance,congestion
                        FROM mmaps_data"""
bing_df = pd.read_sql(bing_query, con)
# Show column dtypes and non-null counts for the loaded frame.
bing_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19080 entries, 0 to 19079
Data columns (total 9 columns):
entry_id            19080 non-null int64
timestamp           19080 non-null datetime64[ns]
datetime            19080 non-null datetime64[ns]
origins             19080 non-null object
destinations        19080 non-null object
travel_mode         19080 non-null object
duration_traffic    19080 non-null int64
distance            19080 non-null int64
congestion          19080 non-null object
dtypes: datetime64[ns](2), int64(3), object(4)
memory usage: 1.5+ MB


In [6]:
# Rename the Bing Maps "duration_traffic" column to "duration" so that it
# lines up with the matching column of gmaps_df. Renaming by label (instead of
# overwriting the whole column list by position) touches only that one column
# and stays correct if the SELECT column order ever changes.
bing_df = bing_df.rename(columns={'duration_traffic': 'duration'})

In [7]:
# No `on=` is given, so pandas joins on every column name the two frames
# share; how='outer' keeps the unmatched rows from both sides.
combined_df = pd.merge(gmaps_df, bing_df, how='outer')

In [8]:
# Rebind combined_df to an independent deep copy of itself.
combined_df = combined_df.copy(deep=True)

In [9]:
# Show column dtypes and non-null counts of combined_df.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48708 entries, 0 to 48707
Data columns (total 10 columns):
entry_id        48708 non-null float64
timestamp       48708 non-null datetime64[ns]
datetime        48708 non-null datetime64[ns]
origins         48708 non-null object
destinations    48708 non-null object
travel_mode     48708 non-null object
duration        48534 non-null float64
distance        48534 non-null float64
fare            17128 non-null float64
congestion      19080 non-null object
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 4.1+ MB


In [10]:
# Keep only the rows that have a duration, then order them by entry_id.
# (The null rows can also be found in SQL:
#  SELECT * FROM gmaps_data WHERE duration IS NULL)
# .loc replaces the removed .ix indexer and sort_values (pandas >= 0.17)
# replaces the removed DataFrame.sort(columns=...); `ascending` is now a real
# bool rather than the string 'True'.
combined_df = (
    combined_df
    .loc[combined_df['duration'].notnull(), :]
    .sort_values(by='entry_id', ascending=True)
)

Note (from above): It appears that all data collected between April 22 and April 25 was corrupted. Those rows, and anything that references them, should be removed.

In [11]:
# Sanity check on combined_df: first the number of rows whose duration is
# missing, then the number of rows whose duration is present.
# print(...) with a single argument behaves the same on Python 2 and 3.
duration_missing = combined_df['duration'].isnull()
print(int(duration_missing.sum()))
print(int((~duration_missing).sum()))

0
48534


In [41]:
# Show column dtypes and non-null counts of combined_df.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48534 entries, 0 to 48707
Data columns (total 10 columns):
entry_id        48534 non-null float64
timestamp       48534 non-null datetime64[ns]
datetime        48534 non-null datetime64[ns]
origins         48534 non-null object
destinations    48534 non-null object
travel_mode     48534 non-null object
duration        48534 non-null float64
distance        48534 non-null float64
fare            17128 non-null float64
congestion      19080 non-null object
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 4.1+ MB


In [40]:
# Preview the first rows of combined_df.
combined_df.head()

Unnamed: 0,entry_id,timestamp,datetime,origins,destinations,travel_mode,duration,distance,fare,congestion
0,1,2015-04-20 01:14:42,2015-04-20 08:14:42,"Outer Sunset, San Francisco, CA, USA","Financial District, San Francisco, CA, USA",transit,2661,11969,2.0,
1,1,2015-04-27 02:32:58,2015-04-27 09:32:56,"Oakland, CA","Pacific Heights, San Francisco, CA",driving,1750,23,,
2,2,2015-04-20 01:14:42,2015-04-20 08:14:42,"Outer Sunset, San Francisco, CA, USA","Oakland, CA, USA",transit,6056,30158,,
3,2,2015-04-27 02:32:58,2015-04-27 09:32:56,"Oakland, CA","Outer Richmond, San+Francisco, CA",driving,2089,27,,
4,3,2015-04-20 01:14:42,2015-04-20 08:14:42,"Noe Valley, San Francisco, CA, USA","Financial District, San Francisco, CA, USA",transit,3182,9355,2.0,


In [71]:
# List every distinct origin label in sorted order (Series.unique() is the
# pandas counterpart of SQL's SELECT DISTINCT), to see which spellings of each
# place, e.g. every form of "Oakland", occur in the data.
origins_sorted = np.sort(combined_df['origins'].unique())
for value in origins_sorted:
    print(value)

Berkeley, CA
Berkeley, CA, USA
Financial District, San Francisco, CA
Financial District, San Francisco, CA, USA
Mission District, San Francisco, CA
Mission District, San Francisco, CA, USA
Mountain View, CA
Mountain View, CA, USA
Noe Valley, San Francisco, CA, USA
Noe Valley, San+Francisco, CA
North Beach, San Francisco, CA
North Beach, San Francisco, CA, USA
Oakland, CA
Oakland, CA, USA
Outer Richmond, San Francisco, CA, USA
Outer Richmond, San+Francisco, CA
Outer Sunset, San Francisco, CA
Outer Sunset, San Francisco, CA, USA
Pacific Heights, San Francisco, CA
Pacific Heights, San Francisco, CA, USA
Russian Hill, San Francisco, CA
Russian Hill, San Francisco, CA, USA


In [120]:
# Count the rows that are NOT an 'Oakland, CA, USA' -> 'Oakland, CA, USA'
# trip, in two equivalent ways (negated AND vs. OR of the negations); the two
# printed counts should match.
from_oakland = combined_df['origins'] == 'Oakland, CA, USA'
to_oakland = combined_df['destinations'] == 'Oakland, CA, USA'
print(len(combined_df[~(from_oakland & to_oakland)]))
print(len(combined_df[~from_oakland | ~to_oakland]))

# Keep only the rows whose distance is non-zero, as an independent copy.
combined_df2 = combined_df[combined_df['distance'] != 0].copy()
print(len(combined_df))   # row count before the distance filter
print(len(combined_df2))  # row count after the distance filter

48036
48036
48534


<class 'pandas.core.frame.DataFrame'>
Int64Index: 48036 entries, 0 to 48707
Data columns (total 10 columns):
entry_id        48036 non-null float64
timestamp       48036 non-null datetime64[ns]
datetime        48036 non-null datetime64[ns]
origins         48036 non-null object
destinations    48036 non-null object
travel_mode     48036 non-null object
duration        48036 non-null float64
distance        48036 non-null float64
fare            17128 non-null float64
congestion      19080 non-null object
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 4.0+ MB


In [None]:
#write to csv
#df.to_csv('foo.csv')
#read from csv
#pd.read_csv('foo.csv')