In [1]:
import pandas as pd
import numpy as np

In [2]:
# With the earthquakes.csv file, select all the earthquakes in Japan with a magType of mb and a magnitude of 4.9 or greater

In [3]:
earthquakes = pd.read_csv('earthquakes_exercise.csv')
earthquakes

Unnamed: 0,mag,magType,time,place,tsunami,parsed_place
0,1.35,ml,1539475168010,"9km NE of Aguanga, CA",0,California
1,1.29,ml,1539475129610,"9km NE of Aguanga, CA",0,California
2,3.42,ml,1539475062610,"8km NE of Aguanga, CA",0,California
3,0.44,ml,1539474978070,"9km NE of Aguanga, CA",0,California
4,2.16,md,1539474716050,"10km NW of Avenal, CA",0,California
...,...,...,...,...,...,...
9327,0.62,md,1537230228060,"9km ENE of Mammoth Lakes, CA",0,California
9328,1.00,ml,1537230135130,"3km W of Julian, CA",0,California
9329,2.40,md,1537229908180,"35km NNE of Hatillo, Puerto Rico",0,Puerto Rico
9330,1.10,ml,1537229545350,"9km NE of Aguanga, CA",0,California


In [4]:
earthquakes.query('parsed_place == "Japan" and magType == "mb" and mag >= 4.9')

Unnamed: 0,mag,magType,time,place,tsunami,parsed_place
1563,4.9,mb,1538977532250,"293km ESE of Iwo Jima, Japan",0,Japan
2576,5.4,mb,1538697528010,"37km E of Tomakomai, Japan",0,Japan
3072,4.9,mb,1538579732490,"15km ENE of Hasaki, Japan",0,Japan
3632,4.9,mb,1538450871260,"53km ESE of Hitachi, Japan",0,Japan


In [5]:
# Create bins for each full number of magnitude (for example, the first bin is 0-1, the second is 1-2, and so on) with a magType of ml and count how many are in each bin.

In [6]:
earthquakes.query('magType == "ml"').assign(mag_bin=lambda x: pd.cut(x.mag, np.arange(0, 10))).mag_bin.value_counts()

(1, 2]    3105
(0, 1]    2207
(2, 3]     862
(3, 4]     122
(4, 5]       2
(5, 6]       1
(8, 9]       0
(7, 8]       0
(6, 7]       0
Name: mag_bin, dtype: int64

In [8]:
# Using faang.csv file, group by the ticker and resample to monthly frequency. 
# Aggregate the open and close prices with the mean, the high price with the max, the low price with the min, and the volume with the sum

In [24]:
faang = pd.read_csv('faang.csv', parse_dates=['date'], index_col='date')
faang.head()

Unnamed: 0_level_0,high,low,open,close,volume,ticker
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,43.075001,42.314999,42.540001,43.064999,102223600.0,AAPL
2018-01-03,43.637501,42.990002,43.1325,43.057499,118071600.0,AAPL
2018-01-04,43.3675,43.02,43.134998,43.2575,89738400.0,AAPL
2018-01-05,43.842499,43.262501,43.360001,43.75,94640000.0,AAPL
2018-01-08,43.9025,43.482498,43.587502,43.587502,82271200.0,AAPL


In [25]:
faang.dtypes

high      float64
low       float64
open      float64
close     float64
volume    float64
ticker     object
dtype: object

In [26]:
faang.groupby('ticker').resample('1M').agg({'open':np.mean, 'high':np.max, 'low':np.min, 'close':np.mean, 'volume':np.sum})

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2018-01-31,43.505357,45.025002,41.174999,43.501309,2638718000.0
AAPL,2018-02-28,41.819079,45.154999,37.560001,41.909737,3711577000.0
AAPL,2018-03-31,43.761786,45.875,41.235001,43.624048,2854911000.0
AAPL,2018-04-30,42.44131,44.735001,40.157501,42.458572,2664617000.0
AAPL,2018-05-31,46.239091,47.592499,41.317501,46.384205,2483905000.0
AAPL,2018-06-30,47.180119,48.549999,45.182499,47.155357,2110498000.0
AAPL,2018-07-31,47.549048,48.990002,45.855,47.577857,1574766000.0
AAPL,2018-08-31,53.121739,57.217499,49.327499,53.336522,2801276000.0
AAPL,2018-09-30,55.582763,57.4175,53.825001,55.518421,2715888000.0
AAPL,2018-10-31,55.3,58.3675,51.522499,55.211413,3158994000.0


In [27]:
# Build a crosstab with the earthquake data between the tsunami column and the magType column. 
# Rather than showing the frequency count, show the maximum magnitude that was observed for each combination. Put the magnitude type along the columns.

In [33]:
earthquakes.head()

Unnamed: 0,mag,magType,time,place,tsunami,parsed_place
0,1.35,ml,1539475168010,"9km NE of Aguanga, CA",0,California
1,1.29,ml,1539475129610,"9km NE of Aguanga, CA",0,California
2,3.42,ml,1539475062610,"8km NE of Aguanga, CA",0,California
3,0.44,ml,1539474978070,"9km NE of Aguanga, CA",0,California
4,2.16,md,1539474716050,"10km NW of Avenal, CA",0,California


In [36]:
pd.crosstab(index=earthquakes.tsunami, columns=earthquakes.magType, values=earthquakes.mag,  aggfunc='max')


magType,mb,mb_lg,md,mh,ml,ms_20,mw,mwb,mwr,mww
tsunami,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5.6,3.5,4.11,1.1,4.2,,3.83,5.8,4.8,6.0
1,6.1,,,,5.1,5.7,4.41,,,7.5


In [37]:
# Calculate the rolling 60-day aggregations of the OHLC data by ticker for the FAANG data. Use the same aggregations as exercise 3.

In [43]:
faang.groupby('ticker').rolling('60D').agg({'open': np.mean, 'high': np.max, 'low': np.min, 'close': np.mean, 'volume': np.sum})

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2018-01-02,42.540001,43.075001,42.314999,43.064999,102223600.0
AAPL,2018-01-03,42.836250,43.637501,42.314999,43.061249,220295200.0
AAPL,2018-01-04,42.935833,43.637501,42.314999,43.126666,310033600.0
AAPL,2018-01-05,43.041875,43.842499,42.314999,43.282499,404673600.0
AAPL,2018-01-08,43.151000,43.902500,42.314999,43.343500,486944800.0
...,...,...,...,...,...,...
NFLX,2018-12-24,283.509251,332.049988,233.679993,281.931750,525657600.0
NFLX,2018-12-26,281.844501,332.049988,231.229996,280.777750,520444300.0
NFLX,2018-12-27,281.070489,332.049988,231.229996,280.162927,532679500.0
NFLX,2018-12-28,279.916342,332.049988,231.229996,279.461464,521973500.0


In [44]:
# Create a pivot table of the faang data that compares the stocks. Put the ticker in the rows and show the averages of the OHLC and volume traded data.

In [50]:
faang.pivot_table(index='ticker', values={'open':np.mean, 'high':np.max, 'low':np.min, 'close':np.mean,'volume':np.sum})

Unnamed: 0_level_0,close,high,low,open,volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAPL,47.263357,47.748526,46.795877,47.277859,136080300.0
AMZN,1641.726176,1662.839839,1619.840519,1644.072709,5648994.0
GOOG,1113.225134,1125.777606,1101.001658,1113.554101,1741965.0
NFLX,319.290319,325.219322,313.18733,319.620558,11469620.0


In [52]:
faang.pivot_table(index='ticker')

Unnamed: 0_level_0,close,high,low,open,volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAPL,47.263357,47.748526,46.795877,47.277859,136080300.0
AMZN,1641.726176,1662.839839,1619.840519,1644.072709,5648994.0
GOOG,1113.225134,1125.777606,1101.001658,1113.554101,1741965.0
NFLX,319.290319,325.219322,313.18733,319.620558,11469620.0


In [53]:
# Calculate the Z-scores of Amazon data using apply()

In [58]:
faang.query('ticker == "AMZN"').drop(columns='ticker').apply(lambda x: x.sub(x.mean()).div(x.std())).head()

Unnamed: 0_level_0,high,low,open,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-02,-2.392713,-2.256607,-2.368567,-2.294847,-1.157639
2018-01-03,-2.314329,-2.167263,-2.286783,-2.217848,-0.995307
2018-01-04,-2.261803,-2.085101,-2.202993,-2.190526,-1.029278
2018-01-05,-2.194652,-2.058283,-2.140226,-2.091426,-0.824511
2018-01-08,-2.073509,-1.947644,-2.047455,-2.001551,-0.5366


In [59]:
# 1. Create a dataframe with three columns: ticker, date and event
#   A. ticker will be 'FB'
#   B. date will be date times ['2018-07-25', '2018-03-19', '2018-03-20']
#   C. event will be ['Dissappointing user growth announced after close.', 'Cambridge Analytica story', 'FTC investigation']
# 2. Set the index to ['date', 'ticker']
# 3. Merge this data to the FAANG data with outer join

In [67]:
events = pd.DataFrame({
  'ticker' : 'FB',
  'date' : pd.to_datetime(['2018-07-25', '2018-03-19', '2018-03-20']),
  'event' : ['Dissappointing user growth announced after close.', 'Cambridge Analytica story', 'FTC investigation']}
).set_index(['date', 'ticker'])

events

Unnamed: 0_level_0,Unnamed: 1_level_0,event
date,ticker,Unnamed: 2_level_1
2018-07-25,FB,Dissappointing user growth announced after close.
2018-03-19,FB,Cambridge Analytica story
2018-03-20,FB,FTC investigation


In [69]:
faang.head()

Unnamed: 0_level_0,high,low,open,close,volume,ticker
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,43.075001,42.314999,42.540001,43.064999,102223600.0,AAPL
2018-01-03,43.637501,42.990002,43.1325,43.057499,118071600.0,AAPL
2018-01-04,43.3675,43.02,43.134998,43.2575,89738400.0,AAPL
2018-01-05,43.842499,43.262501,43.360001,43.75,94640000.0,AAPL
2018-01-08,43.9025,43.482498,43.587502,43.587502,82271200.0,AAPL


In [74]:
faang_merged = faang.merge(events, left_on='date', right_on='date', how='outer')
faang_merged.head()

Unnamed: 0_level_0,high,low,open,close,volume,ticker,event
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-02,43.075001,42.314999,42.540001,43.064999,102223600.0,AAPL,
2018-01-02,1190.0,1170.51001,1172.0,1189.01001,2694500.0,AMZN,
2018-01-02,201.649994,195.419998,196.100006,201.070007,10966900.0,NFLX,
2018-01-02,1066.939941,1045.22998,1048.339966,1065.0,1237600.0,GOOG,
2018-01-03,43.637501,42.990002,43.1325,43.057499,118071600.0,AAPL,


In [79]:
faang.reset_index().set_index(['date', 'ticker']).join(events, how='outer')

Unnamed: 0_level_0,Unnamed: 1_level_0,high,low,open,close,volume,event
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-02,AAPL,43.075001,42.314999,42.540001,43.064999,102223600.0,
2018-01-02,AMZN,1190.000000,1170.510010,1172.000000,1189.010010,2694500.0,
2018-01-02,GOOG,1066.939941,1045.229980,1048.339966,1065.000000,1237600.0,
2018-01-02,NFLX,201.649994,195.419998,196.100006,201.070007,10966900.0,
2018-01-03,AAPL,43.637501,42.990002,43.132500,43.057499,118071600.0,
...,...,...,...,...,...,...,...
2018-12-28,NFLX,261.910004,249.800003,257.940002,256.079987,10992800.0,
2018-12-31,AAPL,39.840000,39.119999,39.632500,39.435001,140014000.0,
2018-12-31,AMZN,1520.760010,1487.000000,1510.800049,1501.969971,6954500.0,
2018-12-31,GOOG,1052.699951,1023.590027,1050.959961,1035.609985,1493300.0,


In [80]:
# Use the transform() method on the faang data to represent all the values in terms of the first date in the data. 
# To do so, divide all values for each ticker by the values of the first date in the data for that ticker. 

In [81]:
faang = faang.reset_index().set_index(['ticker', 'date'])
faang_index = (faang / faang.groupby(level='ticker').transform('first'))

faang_index.groupby(level='ticker').agg('head', 3)


Unnamed: 0_level_0,Unnamed: 1_level_0,high,low,open,close,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2018-01-02,1.0,1.0,1.0,1.0,1.0
AAPL,2018-01-03,1.013059,1.015952,1.013928,0.999826,1.155033
AAPL,2018-01-04,1.00679,1.016661,1.013987,1.00447,0.877864
AMZN,2018-01-02,1.0,1.0,1.0,1.0,1.0
AMZN,2018-01-03,1.013017,1.015199,1.013908,1.012775,1.153758
AMZN,2018-01-04,1.021739,1.029175,1.028157,1.017308,1.121581
NFLX,2018-01-02,1.0,1.0,1.0,1.0,1.0
NFLX,2018-01-03,1.022614,1.031112,1.030342,1.019794,0.783394
NFLX,2018-01-04,1.026779,1.043905,1.051504,1.022679,0.5498
GOOG,2018-01-02,1.0,1.0,1.0,1.0,1.0


In [82]:
# 1. Read in data from covid19_cases.csv file
# 2. Create a date column by parsing the dateRep column into a datetime
# 3. Set the date column as the index
# 4. Use the replace() method to update all occurrences of United_States_America and United Kingdom to USA and UK
# 5. Sort the index

In [83]:
covid19 = pd.read_csv('covid19_cases.csv')
covid19.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
0,01/01/2020,1,1,2020,0,0,Lithuania,LT,LTU,2794184.0,Europe,
1,01/01/2020,1,1,2020,0,0,Iceland,IS,ISL,356991.0,Europe,
2,01/01/2020,1,1,2020,0,0,Nepal,NP,NPL,28608715.0,Asia,
3,01/01/2020,1,1,2020,0,0,San_Marino,SM,SMR,34453.0,Europe,
4,01/01/2020,1,1,2020,0,0,Canada,CA,CAN,37411038.0,America,


In [85]:
covid19['date'] = pd.to_datetime(covid19.dateRep)
covid19.set_index('date', inplace=True)

In [97]:
covid19['countriesAndTerritories'] = covid19['countriesAndTerritories'].str.replace('United_States_of_America', 'USA').str.replace('United Kingdom', 'UK')

In [98]:
covid19.sort_index()

Unnamed: 0_level_0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-01-01,01/01/2020,1,1,2020,0,0,Lithuania,LT,LTU,2794184.0,Europe,
2020-01-01,01/01/2020,1,1,2020,0,0,Singapore,SG,SGP,5804343.0,Asia,
2020-01-01,01/01/2020,1,1,2020,0,0,Egypt,EG,EGY,100388076.0,Africa,
2020-01-01,01/01/2020,1,1,2020,0,0,Azerbaijan,AZ,AZE,10047719.0,Europe,
2020-01-01,01/01/2020,1,1,2020,0,0,Switzerland,CH,CHE,8544527.0,Europe,
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-09,12/09/2020,12,9,2020,1526,87,Ecuador,EC,ECU,17373657.0,America,14.913383
2020-12-09,12/09/2020,12,9,2020,5,0,Thailand,TH,THA,69625581.0,Asia,0.078994
2020-12-09,12/09/2020,12,9,2020,571,7,Uzbekistan,UZ,UZB,32981715.0,Asia,16.812346
2020-12-09,12/09/2020,12,9,2020,96,2,French_Polynesia,PF,PYF,279285.0,Oceania,168.644933


In [99]:
covid=pd.read_csv('covid19_cases.csv')\
  .assign(date=lambda x: pd.to_datetime(x.dateRep, format='%d/%m/%Y'))\
    .set_index('date')\
      .replace('United_States_Of_America', 'USA')\
        .replace('United_Kingdom', 'UK')\
          .sort_index()

covid.head()

Unnamed: 0_level_0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-01-01,01/01/2020,1,1,2020,0,0,Lithuania,LT,LTU,2794184.0,Europe,
2020-01-01,01/01/2020,1,1,2020,0,0,Iceland,IS,ISL,356991.0,Europe,
2020-01-01,01/01/2020,1,1,2020,0,0,Nepal,NP,NPL,28608715.0,Asia,
2020-01-01,01/01/2020,1,1,2020,0,0,San_Marino,SM,SMR,34453.0,Europe,
2020-01-01,01/01/2020,1,1,2020,0,0,Canada,CA,CAN,37411038.0,America,


In [105]:
# For the 5 countries with the most cases (cumulative), find the day with the largest number of cases


In [106]:
top_five_countries = covid.groupby('countriesAndTerritories').cases.sum().nlargest(5).index
covid[covid.countriesAndTerritories.isin(top_five_countries)].groupby('countriesAndTerritories').cases.idxmax()

countriesAndTerritories
Brazil                     2020-07-30
India                      2020-09-17
Peru                       2020-08-17
Russia                     2020-07-18
United_States_of_America   2020-07-25
Name: cases, dtype: datetime64[ns]

In [104]:
# Find the 7-day average change in COVID19 cases for the last week in the data for the coutries found in part 2

In [107]:
covid.groupby(['countriesAndTerritories', pd.Grouper(freq='1D')]).cases.sum().unstack(0).diff().rolling(7).mean().last('1W')[top_five_countries]

countriesAndTerritories,United_States_of_America,India,Brazil,Russia,Peru
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-09-14,473.714286,181.285714,35.285714,36.285714,73.142857
2020-09-15,1513.0,1142.857143,697.428571,46.285714,377.571429
2020-09-16,3478.714286,59.571429,3196.285714,61.428571,-65.0
2020-09-17,-1047.0,308.428571,143.428571,810.0,-29.428571
2020-09-18,865.714286,-18.142857,-607.714286,-688.428571,-227.571429


In [108]:
# Find the first date that each country other than China had cases
covid.reset_index()\
  .pivot(index='date', columns='countriesAndTerritories', values='cases')\
    .drop(columns='China')\
      .fillna(0)\
        .apply(lambda x: x[(x > 0)].idxmin())\
          .sort_values()\
            .rename(lambda x: x.replace('_', ' '))

countriesAndTerritories
Thailand                   2020-01-13
Japan                      2020-01-15
South Korea                2020-01-20
United States of America   2020-01-21
Taiwan                     2020-01-21
                              ...    
Lesotho                    2020-05-15
Uruguay                    2020-05-17
Western Sahara             2020-06-20
Mali                       2020-07-07
Puerto Rico                2020-09-10
Length: 209, dtype: datetime64[ns]

In [109]:
# Rank the countries by total cases using percentiles

In [110]:
covid.pivot_table(columns='countriesAndTerritories', values='cases', aggfunc='sum')\
  .T\
    .transform('rank', method='max', pct=True)\
      .sort_values('cases', ascending=False)\
        .rename(lambda x: x.replace('_', ' '))

Unnamed: 0_level_0,cases
countriesAndTerritories,Unnamed: 1_level_1
United States of America,1.000000
India,0.995238
Brazil,0.990476
Russia,0.985714
Peru,0.980952
...,...
Greenland,0.023810
Montserrat,0.019048
Falkland Islands (Malvinas),0.019048
Holy See,0.009524
