In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#### Loading dictionary from file

In [5]:
# The .npy file stores a pickled Python object; allow_pickle=True lets np.load
# unpickle it, and .item() unwraps the resulting 0-d object array into that object
# (a dict here, as the display in the next cell shows).
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
uni_dict = np.load('example_dict.npy', allow_pickle=True).item()

In [6]:
uni_dict

{'Boston': ['Northeastern University', 'Boston University'],
 'London': ['UCL',
  'Kings College',
  'Queen Mary University',
  'Royal Holloway University'],
 'Oxford': ['Oxford University']}

In [7]:
uni_dict.keys()

dict_keys(['Boston', 'London', 'Oxford'])

#### dictionary to dataframe

In [8]:
# Flatten the {city: [university, ...]} mapping into [city, university] pairs.
# The prints trace every step so the reshaping is easy to follow.
df_rows = []
for city, universities in uni_dict.items():
    print('key', city)
    print('value',universities)
    for university in universities:
        row = [city, university]
        print('this will be a row: ', row)
        df_rows.append(row)

key Boston
value ['Northeastern University', 'Boston University']
this will be a row:  ['Boston', 'Northeastern University']
this will be a row:  ['Boston', 'Boston University']
key London
value ['UCL', 'Kings College', 'Queen Mary University', 'Royal Holloway University']
this will be a row:  ['London', 'UCL']
this will be a row:  ['London', 'Kings College']
this will be a row:  ['London', 'Queen Mary University']
this will be a row:  ['London', 'Royal Holloway University']
key Oxford
value ['Oxford University']
this will be a row:  ['Oxford', 'Oxford University']


In [9]:
df_rows

[['Boston', 'Northeastern University'],
 ['Boston', 'Boston University'],
 ['London', 'UCL'],
 ['London', 'Kings College'],
 ['London', 'Queen Mary University'],
 ['London', 'Royal Holloway University'],
 ['Oxford', 'Oxford University']]

In [10]:
# Each inner [city, university] list becomes one row; column names are given explicitly.
uni_df = pd.DataFrame(df_rows, columns=['city','university'])

In [11]:
uni_df

Unnamed: 0,city,university
0,Boston,Northeastern University
1,Boston,Boston University
2,London,UCL
3,London,Kings College
4,London,Queen Mary University
5,London,Royal Holloway University
6,Oxford,Oxford University


#### apply function to column

In [14]:
uni_df['university']

0      Northeastern University
1            Boston University
2                          UCL
3                Kings College
4        Queen Mary University
5    Royal Holloway University
6            Oxford University
Name: university, dtype: object

In [15]:
x = uni_df['university'].iloc[0]
x

'Northeastern University'

In [16]:
x.split(' ')[0]

'Northeastern'

In [17]:
# Apply the single-value experiment above to the whole column:
# keep the first space-separated word of every university name.
uni_df['university'].apply(lambda x: x.split(' ')[0])

0    Northeastern
1          Boston
2             UCL
3           Kings
4           Queen
5           Royal
6          Oxford
Name: university, dtype: object

In [18]:
x = uni_df['university'].iloc[5]
x

'Royal Holloway University'

In [19]:
x.replace(' University','')

'Royal Holloway'

In [20]:
# Remove the ' University' suffix where present; names without it pass through unchanged.
# The result is only displayed here - nothing is assigned back to uni_df.
uni_df['university'].apply(lambda x: x.replace(' University',''))

0      Northeastern
1            Boston
2               UCL
3     Kings College
4        Queen Mary
5    Royal Holloway
6            Oxford
Name: university, dtype: object

In [21]:
uni_df

Unnamed: 0,city,university
0,Boston,Northeastern University
1,Boston,Boston University
2,London,UCL
3,London,Kings College
4,London,Queen Mary University
5,London,Royal Holloway University
6,Oxford,Oxford University


In [22]:
# Store the shortened names in a new column; the original 'university' column is kept.
# NOTE(review): the new column name is misspelled ('univeristy_name'). It is left as-is
# here because the recorded outputs use this spelling; rename it if the notebook is re-run.
uni_df['univeristy_name'] = uni_df['university'].apply(lambda x: x.replace(' University',''))
uni_df

Unnamed: 0,city,university,univeristy_name
0,Boston,Northeastern University,Northeastern
1,Boston,Boston University,Boston
2,London,UCL,UCL
3,London,Kings College,Kings College
4,London,Queen Mary University,Queen Mary
5,London,Royal Holloway University,Royal Holloway
6,Oxford,Oxford University,Oxford


#### apply function to multiple columns

In [23]:
uni_df.apply(lambda x: x.city + '_' + x.university, axis=1)

0      Boston_Northeastern University
1            Boston_Boston University
2                          London_UCL
3                London_Kings College
4        London_Queen Mary University
5    London_Royal Holloway University
6            Oxford_Oxford University
dtype: object

In [24]:
# Row-wise apply (axis=1): the lambda receives one row at a time, so values
# from several columns can be combined into a single new value.
uni_df['city_plus_uni'] = uni_df.apply(
    lambda row: row['city'] + '_' + row['university'],
    axis=1,
)
uni_df

Unnamed: 0,city,university,univeristy_name,city_plus_uni
0,Boston,Northeastern University,Northeastern,Boston_Northeastern University
1,Boston,Boston University,Boston,Boston_Boston University
2,London,UCL,UCL,London_UCL
3,London,Kings College,Kings College,London_Kings College
4,London,Queen Mary University,Queen Mary,London_Queen Mary University
5,London,Royal Holloway University,Royal Holloway,London_Royal Holloway University
6,Oxford,Oxford University,Oxford,Oxford_Oxford University


#### merging 


In [25]:
# Left table: one row per (city, university); .copy() makes df_x independent of uni_df.
df_x = uni_df[['city', 'university']].copy()
# Right table: city -> country lookup. It has no 'Oxford' row and contains 'New York',
# which never appears in df_x - these mismatches are what make the join types below differ.
df_y = pd.DataFrame({'city':['London', 'Boston','New York'],
                    'country':['UK','USA','USA']})

In [26]:
df_x

Unnamed: 0,city,university
0,Boston,Northeastern University
1,Boston,Boston University
2,London,UCL
3,London,Kings College
4,London,Queen Mary University
5,London,Royal Holloway University
6,Oxford,Oxford University


In [27]:
df_y

Unnamed: 0,city,country
0,London,UK
1,Boston,USA
2,New York,USA


<img src="join.png">

In [28]:
# Inner join: keep only cities that occur in both frames (Oxford and New York drop out).
# No 'on=' is given, so pandas joins on the column name the frames share ('city').
pd.merge(df_x, df_y, how='inner')

Unnamed: 0,city,university,country
0,Boston,Northeastern University,USA
1,Boston,Boston University,USA
2,London,UCL,UK
3,London,Kings College,UK
4,London,Queen Mary University,UK
5,London,Royal Holloway University,UK


In [29]:
# Outer join: keep every city from either frame; values missing on one side stay empty.
pd.merge(df_x, df_y, how='outer')

Unnamed: 0,city,university,country
0,Boston,Northeastern University,USA
1,Boston,Boston University,USA
2,London,UCL,UK
3,London,Kings College,UK
4,London,Queen Mary University,UK
5,London,Royal Holloway University,UK
6,Oxford,Oxford University,
7,New York,,USA


In [30]:
# Left join: keep every row of df_x; Oxford has no match in df_y, so its country is empty.
pd.merge(df_x, df_y, how='left')

Unnamed: 0,city,university,country
0,Boston,Northeastern University,USA
1,Boston,Boston University,USA
2,London,UCL,UK
3,London,Kings College,UK
4,London,Queen Mary University,UK
5,London,Royal Holloway University,UK
6,Oxford,Oxford University,


In [31]:
# Right join: keep every row of df_y; New York has no match in df_x, so its university is empty.
pd.merge(df_x, df_y, how='right')

Unnamed: 0,city,university,country
0,Boston,Northeastern University,USA
1,Boston,Boston University,USA
2,London,UCL,UK
3,London,Kings College,UK
4,London,Queen Mary University,UK
5,London,Royal Holloway University,UK
6,New York,,USA


In [32]:
# Same lookup table as before, but the key column is now called 'city_name'
# so that it no longer matches the 'city' column of df_x.
df_y = pd.DataFrame(
    [
        ('London', 'UK'),
        ('Boston', 'USA'),
        ('New York', 'USA'),
    ],
    columns=['city_name', 'country'],
)

In [33]:
df_y

Unnamed: 0,city_name,country
0,London,UK
1,Boston,USA
2,New York,USA


In [None]:
# df_x ('city', 'university') and df_y ('city_name', 'country') no longer share a
# column name, so a merge without explicit keys cannot work and pandas raises MergeError.
# The error is the point of this cell: catch and show it so that
# "Restart & Run All" keeps going instead of stopping here.
try:
    pd.merge(df_x, df_y, how='left')
except pd.errors.MergeError as err:
    print('MergeError:', err)

In [44]:
# The key columns have different names, so each side's key is named explicitly.
# Both key columns ('city' and 'city_name') are kept in the result.
uni_with_location = pd.merge(df_x, df_y, how='left', left_on='city', right_on='city_name')

In [45]:
uni_with_location

Unnamed: 0,city,university,city_name,country
0,Boston,Northeastern University,Boston,USA
1,Boston,Boston University,Boston,USA
2,London,UCL,London,UK
3,London,Kings College,London,UK
4,London,Queen Mary University,London,UK
5,London,Royal Holloway University,London,UK
6,Oxford,Oxford University,,


#### merging on multiple columns

In [40]:
# Toy GDP figures: one row per (city, year).
gdp = pd.DataFrame(
    [
        ('London', 2017, 1),
        ('London', 2018, 2),
        ('Boston', 2017, 3),
        ('Boston', 2018, 4),
        ('New York', 2017, 5),
        ('New York', 2018, 6),
    ],
    columns=['city', 'year', 'GDP'],
)
# Toy scholarship averages: one row per (university, year).
scholarship = pd.DataFrame(
    [
        ('Queen Mary University', 2017, 1),
        ('Royal Holloway University', 2017, 2),
        ('Boston University', 2017, 3),
        ('Queen Mary University', 2018, 4),
        ('Royal Holloway University', 2018, 5),
        ('Boston University', 2018, 6),
    ],
    columns=['university', 'year', 'scholarship_avg'],
)

In [41]:
gdp

Unnamed: 0,city,year,GDP
0,London,2017,1
1,London,2018,2
2,Boston,2017,3
3,Boston,2018,4
4,New York,2017,5
5,New York,2018,6


In [42]:
scholarship

Unnamed: 0,university,year,scholarship_avg
0,Queen Mary University,2017,1
1,Royal Holloway University,2017,2
2,Boston University,2017,3
3,Queen Mary University,2018,4
4,Royal Holloway University,2018,5
5,Boston University,2018,6


In [46]:
uni_with_location

Unnamed: 0,city,university,city_name,country
0,Boston,Northeastern University,Boston,USA
1,Boston,Boston University,Boston,USA
2,London,UCL,London,UK
3,London,Kings College,London,UK
4,London,Queen Mary University,London,UK
5,London,Royal Holloway University,London,UK
6,Oxford,Oxford University,,


In [None]:
# Attach each university's city to the scholarship rows.
# 'university' is the only column the two frames share, so it is the join key.
scholarship_with_city = scholarship.merge(
    uni_with_location[['university', 'city']],
    how='left',
    on='university',
)
scholarship_with_city

In [None]:
# Composite key: the GDP value is looked up per (city, year) pair.
scholarship_with_city_GDP = scholarship_with_city.merge(
    gdp,
    how='left',
    on=['city', 'year'],
)
scholarship_with_city_GDP


#### seaborn plots

In [None]:
import seaborn as sns
#https://seaborn.pydata.org/examples/index.html

In [None]:
# pandas built-in box plot of the GDP column (compare with the seaborn version below).
scholarship_with_city_GDP.boxplot('GDP')

In [None]:
# Vertical box plot of GDP, the seaborn counterpart of the pandas plot above.
# The value column is passed as y: combining x='GDP' with orient='v' is contradictory,
# and newer seaborn releases ignore the orient with a warning and draw a horizontal box.
sns.boxplot(data=scholarship_with_city_GDP, y='GDP', orient='v')

In [None]:
# pandas bar chart: draws one bar per row, so a city appears once for every row it has.
scholarship_with_city_GDP.plot.bar(x='city',y='GDP')

In [None]:
# seaborn bar plot: aggregates the rows of each city into a single bar
# (mean by default) and adds an error bar.
sns.barplot(data=scholarship_with_city_GDP, x='city', y='GDP')

In [None]:
# pandas histogram of GDP; the double brackets select a one-column DataFrame.
scholarship_with_city_GDP[['GDP']].hist()

In [None]:
# seaborn distribution plot of GDP.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 in favour of
# histplot/displot - confirm the installed seaborn version before relying on it.
sns.distplot(scholarship_with_city_GDP['GDP'])

In [None]:
# pandas scatter plot: scholarship average against GDP.
scholarship_with_city_GDP.plot.scatter(x='scholarship_avg',y='GDP')

In [None]:
# seaborn joint plot: the same scatter plus the marginal distribution of each variable.
sns.jointplot(data=scholarship_with_city_GDP, x='scholarship_avg',y='GDP')

In [None]:
# seaborn lmplot: scatter plot with a fitted linear regression line.
sns.lmplot(data=scholarship_with_city_GDP, x='scholarship_avg',y='GDP')

In [None]:
# hue='city': one colour and one regression line per city.
sns.lmplot(data=scholarship_with_city_GDP, x='scholarship_avg',y='GDP', hue='city')

In [None]:
# col='year': additionally split the figure into one panel per year, side by side.
sns.lmplot(data=scholarship_with_city_GDP, x='scholarship_avg',y='GDP', hue='city', col='year')

In [None]:
import bokeh
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool
output_notebook()

In [None]:
# Basic Bokeh scatter: London rows only, scholarship average against GDP.
# NOTE(review): `plot_width`/`plot_height` and `legend=` belong to older Bokeh
# releases (newer ones use `width`/`height` and `legend_label`) - confirm the installed version.
london_rows = scholarship_with_city_GDP[scholarship_with_city_GDP['city']=='London']
p = figure(plot_width=600, plot_height=400)
p.scatter(
    x=london_rows['scholarship_avg'],
    y=london_rows['GDP'],
    color='lime', alpha=0.8, legend='London',
    marker='circle', size=15,
)
show(p)

In [None]:
# Two glyph series on one figure - one per city - each with its own legend entry.
p = figure(plot_width=600, plot_height=400)
for city, color in (('London', 'lime'), ('Boston', 'purple')):
    # Filter once per city instead of repeating the boolean mask for x and y.
    city_rows = scholarship_with_city_GDP[scholarship_with_city_GDP['city']==city]
    p.scatter(
        x=city_rows['scholarship_avg'],
        y=city_rows['GDP'],
        color=color, alpha=0.8, legend=city,
        marker='circle', size=15,
    )
p.legend.location = "top_left"
show(p)

In [None]:

# Interactive legend: clicking a legend entry mutes (fades) that city's dots.
p = figure( plot_width=600, plot_height=400)
# The title now matches what happens: the policy is "mute" and the glyphs are dots,
# not lines that get hidden.
p.title.text = 'Click on legend entries to mute the corresponding dots'

# (city, colour, alpha used while muted) - one scatter series per city.
for city, color, muted_alpha in (('London', 'lime', 0.2), ('Boston', 'purple', 0.6)):
    city_rows = scholarship_with_city_GDP[scholarship_with_city_GDP['city']==city]
    p.scatter(x=city_rows['scholarship_avg'],
              y=city_rows['GDP'],
              color=color, alpha=0.8, legend=city,
              muted_color=color, muted_alpha=muted_alpha,
              marker='circle', size=15)

p.legend.location = "top_left"
# "mute" fades the glyphs using muted_color/muted_alpha; "hide" would remove them.
p.legend.click_policy="mute"

show(p)

In [None]:

# Hover tooltips: the glyphs read their data from a ColumnDataSource, so the
# tooltip can reference any column of the underlying frame with "@column".
p = figure( plot_width=600, plot_height=400)
p.title.text = "Mouse over the dots"

# London series. x/y are column names resolved against the source.
l_df = scholarship_with_city_GDP[scholarship_with_city_GDP['city']=='London']
l_source = ColumnDataSource(l_df)
p.scatter(x='scholarship_avg',
          y='GDP',
          color='lime', alpha=0.8, legend='London',
          muted_color='lime', muted_alpha=0.2,
          marker='circle', size=15,
          source=l_source)

# Boston series, with its own source (distinct name instead of rebinding `source`).
b_df = scholarship_with_city_GDP[scholarship_with_city_GDP['city']=='Boston']
b_source = ColumnDataSource(b_df)
p.scatter(x='scholarship_avg',
          y='GDP',
          color='purple', alpha=0.8, legend='Boston',
          muted_color='purple', muted_alpha=0.2,
          marker='circle', size=15,
          source=b_source)

# A single HoverTool serves all renderers of the figure, so it is registered
# exactly once (the same instance used to be added twice).
hover = HoverTool(tooltips=[("uni", "@university"),
                            ('year', '@year')])
p.add_tools(hover)

p.legend.location = "top_left"
# "mute" fades a series on click; "hide" would remove it instead.
p.legend.click_policy="mute"

show(p)

### 1 - exercise

Load the zipcode_dict_corrected.npy file into a dictionary. <br>
Create a list (called df_rows) from the dictionary where each element looks like ['city','zipcode']. For each city there should be a row for all the corresponding zip codes.

In [57]:
# Load the pickled {city: [zipcode, ...]} dictionary (.item() unwraps the 0-d object array).
# NOTE(review): allow_pickle=True unpickles arbitrary objects - only load trusted files.
zipcode_dict = np.load('zipcode_dict_corrected.npy', allow_pickle = True).item()

# One [city, zipcode] row per zip code of every city. The zip codes are kept as the
# raw strings from the file (most carry a leading ','), to be cleaned in exercise 2.
# The bare `zipcode_dict`, `.keys()` and `.values()` expressions that used to sit here
# were removed: in the middle of a cell they are evaluated but never displayed.
df_rows = []
for city, zipcodes in zipcode_dict.items():
    print('key', city)
    print('value',zipcodes)
    for zipcode in zipcodes:
        row = [city, zipcode]
        print('this will be a row: ', row)
        df_rows.append(row)

key San Jose
value ['94088', ',94089', ',94538', ',94560', ',95002', ',95008', ',95013', ',95035', ',95037', ',95050', ',95054', ',95101', ',95103', ',95106', ',95108', ',95109', ',95110', ',95111', ',95112', ',95113', ',95115', ',95116', ',95117', ',95118', ',95119', ',95120', ',95121', ',95122', ',95123', ',95124', ',95125', ',95126', ',95127', ',95128', ',95129', ',95130', ',95131', ',95132', ',95133', ',95134', ',95135', ',95136', ',95138', ',95139', ',95141', ',95148', ',95150', ',95151', ',95152', ',95153', ',95154', ',95155', ',95156', ',95157', ',95158', ',95160', ',95161', ',95164', ',95170', ',95172', ',95173', ',95190', ',95191', ',95192', ',95193', ',95194', ',95196']
this will be a row:  ['San Jose', '94088']
this will be a row:  ['San Jose', ',94089']
this will be a row:  ['San Jose', ',94538']
this will be a row:  ['San Jose', ',94560']
this will be a row:  ['San Jose', ',95002']
this will be a row:  ['San Jose', ',95008']
this will be a row:  ['San Jose', ',95013']
this

In [56]:
# Only the last expression of a cell is displayed: the keys() call below is
# evaluated but shows nothing, and the output is that of values().
zipcode_dict.keys()
zipcode_dict.values()



dict_values([['94088', ',94089', ',94538', ',94560', ',95002', ',95008', ',95013', ',95035', ',95037', ',95050', ',95054', ',95101', ',95103', ',95106', ',95108', ',95109', ',95110', ',95111', ',95112', ',95113', ',95115', ',95116', ',95117', ',95118', ',95119', ',95120', ',95121', ',95122', ',95123', ',95124', ',95125', ',95126', ',95127', ',95128', ',95129', ',95130', ',95131', ',95132', ',95133', ',95134', ',95135', ',95136', ',95138', ',95139', ',95141', ',95148', ',95150', ',95151', ',95152', ',95153', ',95154', ',95155', ',95156', ',95157', ',95158', ',95160', ',95161', ',95164', ',95170', ',95172', ',95173', ',95190', ',95191', ',95192', ',95193', ',95194', ',95196'], ['94002', ',94061', ',94062', ',94063', ',94064', ',94065', ',94070'], ['94039', ',94040', ',94041', ',94042', ',94043', ',94085'], ['94020', ',94022', ',94024', ',94028', ',94301', ',94302', ',94303', ',94304', ',94306', ',95033'], ['94016', ',94102', ',94103', ',94104', ',94105', ',94107', ',94108', ',94109', ',9

### 1 - check yourself

In [58]:
# Self-check: the exercise expects exactly 146 [city, zipcode] rows.
print('Your list is correct' if len(df_rows) == 146 else 'Your list is NOT correct')

Your list is correct


### 2 - exercise

- Create a dataframe from the df_rows list, where the columns are called landmark and zipcode <br>
- Create a new column in the dataframe called zip that contains the values from the zipcode column without the ',' and converted into integer type
- Drop the zipcode column

### 2 - check yourself

In [None]:
from pandas.api.types import is_numeric_dtype

# Self-check: 'zip' must be a numeric column and the frame must still have all 146 rows.
zip_is_numeric = is_numeric_dtype(zipdf['zip'])
print('The dataframe is correct' if zip_is_numeric and len(zipdf) == 146
      else 'The dataframe is NOT correct')

### 3 - exercise
Load the trip, station and weather CSV files into pandas dataframes <br>
Join them on the appropriate columns and create a final dataframe called merged_df where there is a row for each trip with the weather conditions for the corresponding day in the corresponding city (landmark)

In [None]:
# Your code here



### 3 - check yourself

In [None]:
# Self-check: the sorted number of trips per 'Events' value must match the expected counts.
event_counts = sorted(merged_df['Events'].value_counts().tolist())
print('The dataframe is correct' if event_counts == [496, 793, 10389, 11689, 120057]
      else 'The dataframe is NOT correct')

### 4 - exercise

Create a new column in merged_df called weekend, where the value is True for Saturday and Sunday and False for the other days. If you want, you can first create a helper column that contains the name of the day.

In [None]:
# Your code here



### 4 - check yourself

In [None]:
# Self-check: the sorted counts of the two 'weekend' values must match the expected counts.
weekend_counts = sorted(merged_df['weekend'].value_counts().tolist())
print('The weekend column is correct' if weekend_counts == [22980, 121005]
      else 'The weekend column is NOT correct')

### 5 - exercise

We would like to plot the number of trips taken by Customers/Subscribers by the different weather conditions. Let's start with the Max_Temperature_F! <br>
Create a dataframe that shows, for each possible value of Max_Temperature_F, how many trips were taken by Subscription Type and weekend. It will be easiest to plot if the dataframe is in long format: for each temperature value there are 4 rows: weekend - Customer, weekend - Subscriber, weekday - Customer, weekday - Subscriber. Name the column containing the number of trips num_trips.

In [None]:
# Your code here



In [None]:
toplot

### 5 - check yourself

In [None]:
# Self-check: the long-format frame is expected to have exactly 165 rows.
print('The dataframe is correct' if len(toplot) == 165 else 'The dataframe is NOT correct')

### 6 - exercise
Use seaborn's lmplot to plot the relationship between the max temperature and the number of trips taken by Customers/Subscribers! Draw two separate plots for weekends and weekdays next to each other!

In [None]:
# Your code here



### 7 - exercise

Use a for loop to draw similar charts for all numeric weather conditions! Write one observation about what you see on the charts! It can be about one chart or the comparison of multiple charts!

In [None]:
# Your code here



### Bonus exercise

Use the Bokeh library to draw an interactive plot about the data! The chart should have at least one interactive element (e.g. hover, clickable legend...)