# Data Analysis - Introduction to Pandas

**Author**: [Gabriele Pompa](https://www.linkedin.com/in/gabrielepompa/): gabriele.pompa@unisi.com

# Table of contents

[Executive Summary](#summary)

**TODO**

### **Resources**: 

**TODO**

# Executive Summary <a name="summary"></a>

**TODO**

These are the basic imports that we need to work with NumPy, Pandas and to plot data using Matplotlib functionalities

In [2]:
# for NumPy arrays
import numpy as np

# for Pandas Series and DataFrame
import pandas as pd

# for Matplotlib plotting
import matplotlib.pyplot as plt

# to do inline plots in the Notebook
%matplotlib inline

[OS - Operating System Interfaces](https://docs.python.org/3/library/os.html)

In [8]:
# to create directories and delete files
import os

In [9]:
# ".." means one directory above in the directory tree
# therefore, since we are in the directory "IT_For_Business_And_Finance_2019_20/Notebooks",
# "../Data/" is equivalent to "IT_For_Business_And_Finance_2019_20/Data"

dataFolderPath = "../Data/"

# exist_ok=True turns the call into a no-op when the folder is already there,
# so no separate os.path.exists() check is needed before creating it
os.makedirs(dataFolderPath, exist_ok=True)

---

## Pickle [https://docs.python.org/3/library/pickle.html](https://docs.python.org/3/library/pickle.html)

In [None]:
# number of rows of the test matrix: one million
rows = 10**6

In [None]:
# (rows x 5) matrix built by broadcasting: entry [i-1, k-1] equals i*k for i = 1..rows, k = 1..5.
# This avoids creating millions of Python ints in nested lists before converting them to an array.
# The dtype is fixed to int64 because mat**2 (used further below) reaches (5*rows)**2 = 2.5e13,
# which does not fit in a 32-bit integer (NumPy's default int on some platforms).
mat = np.arange(1, rows + 1, dtype=np.int64)[:, np.newaxis] * np.arange(1, 6, dtype=np.int64)

In [None]:
mat

In [None]:
mat.shape

In [None]:
mat.dtype

[open() function](https://docs.python.org/3/tutorial/inputoutput.html#reading-and-writing-files)

In [None]:
import pickle

[with statement](https://www.geeksforgeeks.org/with-statement-in-python/)

In [None]:
# 'wb' opens the file for writing in binary mode (pickle produces bytes);
# the with statement closes the file automatically when the indented block ends
with open(dataFolderPath + "mat.pkl", 'wb') as file:
    # %time is an IPython magic that reports how long the statement takes to run
    %time pickle.dump(mat, file)

In [None]:
type(file)

In [None]:
file.closed

In [None]:
# 'rb' opens the file for reading in binary mode.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only unpickle
# files you created yourself (as here) or that come from a trusted source
with open(dataFolderPath + "mat.pkl", 'rb') as file:
    %time mat_reloaded = pickle.load(file)

In [None]:
file.closed

In [None]:
mat_reloaded

Remove the file with [os.remove()](https://docs.python.org/3/library/os.html#os.remove)

In [None]:
# build the path once, then delete the file only if it is actually there
matPklPath = dataFolderPath + "mat.pkl"
if os.path.isfile(matPklPath):
    os.remove(matPklPath)

# double-check if file still exists (False means it is gone)
os.path.isfile(matPklPath)

Pickling multiple objects at once (e.g. collected in a dictionary)

In [None]:
# gather several arrays under descriptive keys so that they can be pickled together
mat_dict = dict(mat=mat, mat_squared=np.square(mat))

In [None]:
mat_dict['mat']

In [None]:
mat_dict['mat_squared']

In [None]:
# the whole dictionary (keys and arrays) is serialized with a single dump call;
# 'wb' = write in binary mode, the file is closed when the with block ends
with open(dataFolderPath + "mat_dict.pkl", 'wb') as file:
    %time pickle.dump(mat_dict, file)

In [None]:
# a single load call gives back the dictionary that was dumped ('rb' = read in binary mode).
# NOTE: only unpickle files from a trusted source - pickle.load can run arbitrary code
with open(dataFolderPath + "mat_dict.pkl", 'rb') as file:
    %time mat_dict_reloaded = pickle.load(file)

In [None]:
mat_dict_reloaded['mat']

In [None]:
mat_dict_reloaded['mat_squared']

In [None]:
# build the path once, then delete the file only if it is actually there
matDictPklPath = dataFolderPath + "mat_dict.pkl"
if os.path.isfile(matDictPklPath):
    os.remove(matDictPklPath)

# double-check if file still exists (False means it is gone)
os.path.isfile(matDictPklPath)

---

---

## JSON [https://docs.python.org/3/tutorial/inputoutput.html#saving-structured-data-with-json](https://docs.python.org/3/tutorial/inputoutput.html#saving-structured-data-with-json)

In [3]:
# reference data as a plain dictionary: one key per field, one list entry per firm
refData = dict([
    ('S&P Rating', ['A', 'BB', 'AA', 'CCC']),
    ('Spread', [100, 300, 70, 700]),
    ('Country', ['USA', 'ITA', 'UK', 'ITA']),
])

In [None]:
import json

In [None]:
# 'w' opens the file for writing in text mode (JSON is a text format);
# indent="\t" pretty-prints the output, indenting nested levels with a tab
with open(dataFolderPath + "refData.json", 'w') as file:
    %time json.dump(refData, file, indent="\t")

In [None]:
type(file)

In [None]:
file.closed

In [None]:
# 'r' opens the file for reading in text mode;
# note that the name refData is rebound to the object read back from the file
with open(dataFolderPath + "refData.json", 'r') as file:
    %time refData = json.load(file)

In [None]:
file.closed

In [None]:
refData

In [None]:
# build the path once, then delete the file only if it is actually there
refDataJsonPath = dataFolderPath + "refData.json"
if os.path.isfile(refDataJsonPath):
    os.remove(refDataJsonPath)

# double-check if file still exists (False means it is gone)
os.path.isfile(refDataJsonPath)

---

In [20]:
# firm-level reference data: one row per firm (index), one column per attribute
firmLabels = ['Firm_1', 'Firm_2', 'Firm_3', 'Firm_4']

firmAttributes = {
    'S&P Rating': ['A', 'BB', 'AA', 'CCC'],
    'Spread': [100, 300, 70, 700],
    'Country': ['USA', 'ITA', 'UK', 'ITA'],
    'Market Cap': [430.0, 45.0, 161.25, 5.00],
}

df_refData = pd.DataFrame(data=firmAttributes, index=firmLabels)

df_refData

Unnamed: 0,S&P Rating,Spread,Country,Market Cap
Firm_1,A,100,USA,430.0
Firm_2,BB,300,ITA,45.0
Firm_3,AA,70,UK,161.25
Firm_4,CCC,700,ITA,5.0


## PANDAS + SQL

In [121]:
import sqlite3 as sq3

In [136]:
# open a connection to the SQLite database file (sqlite3 creates the file if it does not exist yet);
# the table itself is created by the CREATE TABLE query executed in the following cells
con = sq3.connect(dataFolderPath + "refData.db")

In [123]:
query = """CREATE TABLE refData (
                Firms TEXT NOT NULL,
                SnP_Rating TEXT,
                Spread INT,
                Country TEXT,
                Market_Cap REAL
)"""

print(query)

CREATE TABLE refData (
                Firms TEXT NOT NULL,
                SnP_Rating TEXT,
                Spread INT,
                Country TEXT,
                Market_Cap REAL
)


In [124]:
# refData.db persists on disk between runs: drop any refData table left over from a
# previous execution, otherwise CREATE TABLE fails with "table refData already exists"
con.execute("DROP TABLE IF EXISTS refData")
con.execute(query)

<sqlite3.Cursor at 0x1c81ef6bab0>

In [125]:
con.commit()

In [126]:
# iterrows() yields one (index label, row) pair per DataFrame row; each row is handed back
# as a Series, so it carries a single dtype for all of its values (printed via row.dtypes)
for index, row in df_refData.iterrows():
    print(index, type(row), row.dtypes)

Firm_1 <class 'pandas.core.series.Series'> object
Firm_2 <class 'pandas.core.series.Series'> object
Firm_3 <class 'pandas.core.series.Series'> object
Firm_4 <class 'pandas.core.series.Series'> object


In [127]:
for index, row in df_refData.iterrows():
    print(index, row['S&P Rating'], row['Spread'], row["Country"], row["Market Cap"])
    print(index, type(row['S&P Rating']), type(row['Spread']), row["Country"], row["Market Cap"])

Firm_1 A 100 USA 430.0
Firm_1 <class 'str'> <class 'int'> USA 430.0
Firm_2 BB 300 ITA 45.0
Firm_2 <class 'str'> <class 'int'> ITA 45.0
Firm_3 AA 70 UK 161.25
Firm_3 <class 'str'> <class 'int'> UK 161.25
Firm_4 CCC 700 ITA 5.0
Firm_4 <class 'str'> <class 'int'> ITA 5.0


In [128]:
df_refData.dtypes

S&P Rating     object
Spread          int64
Country        object
Market Cap    float64
dtype: object

In [129]:
# The SQL text contains only "?" placeholders: the values are passed separately and bound by
# sqlite3 itself. Building the statement with str.format would break on values containing a
# quote character and is open to SQL injection.
query = "INSERT INTO refData VALUES (?, ?, ?, ?, ?)"

for index, row in df_refData.iterrows():
    # int()/float() guarantee plain Python numbers, which sqlite3 can bind directly
    params = (index, row['S&P Rating'], int(row['Spread']), row["Country"], float(row["Market Cap"]))

    print(query, params)
    con.execute(query, params)

# make the inserted rows permanent
con.commit()

INSERT INTO refData VALUES ('Firm_1', 'A', 100, 'USA', 430.0)
INSERT INTO refData VALUES ('Firm_2', 'BB', 300, 'ITA', 45.0)
INSERT INTO refData VALUES ('Firm_3', 'AA', 70, 'UK', 161.25)
INSERT INTO refData VALUES ('Firm_4', 'CCC', 700, 'ITA', 5.0)


The database file can be inspected using [DB Browser for SQLite](https://sqlitebrowser.org/dl/)

In [130]:
query = "SELECT * FROM refData"

df_refData_reloaded = pd.read_sql(sql=query, con=con)

df_refData_reloaded

Unnamed: 0,Firms,SnP_Rating,Spread,Country,Market_Cap
0,Firm_1,A,100,USA,430.0
1,Firm_2,BB,300,ITA,45.0
2,Firm_3,AA,70,UK,161.25
3,Firm_4,CCC,700,ITA,5.0


In [131]:
query = """SELECT * FROM refData"""

# use the Firms column of the table as the index of the DataFrame
df_refData_reloaded = pd.read_sql(sql=query, con=con, index_col="Firms")

# map the SQL column names back to the original labels, pairing them by position
sqlToOriginalNames = dict(zip(df_refData_reloaded.columns, df_refData.columns))
df_refData_reloaded = df_refData_reloaded.rename(columns=sqlToOriginalNames)
df_refData_reloaded

Unnamed: 0_level_0,S&P Rating,Spread,Country,Market Cap
Firms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Firm_1,A,100,USA,430.0
Firm_2,BB,300,ITA,45.0
Firm_3,AA,70,UK,161.25
Firm_4,CCC,700,ITA,5.0


In [132]:
query = "SELECT * FROM refData WHERE Market_Cap > 100"

pd.read_sql(sql=query, con=con, index_col="Firms")

Unnamed: 0_level_0,SnP_Rating,Spread,Country,Market_Cap
Firms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Firm_1,A,100,USA,430.0
Firm_3,AA,70,UK,161.25


In [133]:
# boolean mask flags the firms with Market Cap above 100;
# .loc then selects those rows and the Market Cap column in a single step
largeCapMask = df_refData_reloaded["Market Cap"] > 100
df_refData_reloaded.loc[largeCapMask, "Market Cap"]

Firms
Firm_1    430.00
Firm_3    161.25
Name: Market Cap, dtype: float64

In [147]:
con.close()

In [137]:
# `con` was closed in the previous cell, so a fresh connection is needed before writing.
# if_exists="replace" overwrites the refData table that was created manually above:
# the default (if_exists="fail") raises a ValueError because the table already exists.
con = sq3.connect(dataFolderPath + "refData.db")
df_refData.to_sql(name="refData", con=con, index_label="Firms", if_exists="replace")

In [138]:
query = "SELECT * FROM refData"

df_refData_reloaded = pd.read_sql(sql=query, con=con, index_col="Firms")

# this is the last query against refData.db: release the connection before
# the name `con` is re-used for a different database file further below
con.close()

# map the column names read from the table back to the original labels, pairing them by position
df_refData_reloaded = df_refData_reloaded.rename(
    columns=dict(zip(df_refData_reloaded.columns, df_refData.columns))
)
df_refData_reloaded

Unnamed: 0_level_0,S&P Rating,Spread,Country,Market Cap
Firms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Firm_1,A,100,USA,430.0
Firm_2,BB,300,ITA,45.0
Firm_3,AA,70,UK,161.25
Firm_4,CCC,700,ITA,5.0


### another example

In [180]:
# column k (k = 1..5) holds the k-th power of the base values 1..10;
# rows are labelled with 10 consecutive business days starting on 2020-01-01
baseValues = np.arange(1, 11)
exponents = np.arange(1, 6)

df = pd.DataFrame(data=baseValues[:, np.newaxis] ** exponents,
                  index=pd.date_range('2020-01-01', periods=10, freq='B'),
                  columns=['x', 'x^2', 'x^3', 'x^4', 'x^5'])
df

Unnamed: 0,x,x^2,x^3,x^4,x^5
2020-01-01,1,1,1,1,1
2020-01-02,2,4,8,16,32
2020-01-03,3,9,27,81,243
2020-01-06,4,16,64,256,1024
2020-01-07,5,25,125,625,3125
2020-01-08,6,36,216,1296,7776
2020-01-09,7,49,343,2401,16807
2020-01-10,8,64,512,4096,32768
2020-01-13,9,81,729,6561,59049
2020-01-14,10,100,1000,10000,100000


In [181]:
df.index

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-06',
               '2020-01-07', '2020-01-08', '2020-01-09', '2020-01-10',
               '2020-01-13', '2020-01-14'],
              dtype='datetime64[ns]', freq='B')

In [182]:
df.index[0]

Timestamp('2020-01-01 00:00:00', freq='B')

In [183]:
# open a connection to the SQLite database file df.db (sqlite3 creates the file if it does not exist yet);
# from here on the name `con` refers to this new connection
con = sq3.connect(dataFolderPath + "df.db")

In [184]:
# df.db persists on disk between runs: if_exists="replace" overwrites a table left over from a
# previous execution, whereas the default (if_exists="fail") raises a ValueError in that case
df.to_sql(name="df", con=con, index_label="Dates", if_exists="replace")

In [185]:
query = "SELECT * FROM df"
df_reloaded = pd.read_sql(sql=query, con=con, index_col="Dates")
df_reloaded

Unnamed: 0_level_0,x,x^2,x^3,x^4,x^5
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01 00:00:00,1,1,1,1,1
2020-01-02 00:00:00,2,4,8,16,32
2020-01-03 00:00:00,3,9,27,81,243
2020-01-06 00:00:00,4,16,64,256,1024
2020-01-07 00:00:00,5,25,125,625,3125
2020-01-08 00:00:00,6,36,216,1296,7776
2020-01-09 00:00:00,7,49,343,2401,16807
2020-01-10 00:00:00,8,64,512,4096,32768
2020-01-13 00:00:00,9,81,729,6561,59049
2020-01-14 00:00:00,10,100,1000,10000,100000


In [186]:
df_reloaded.index

Index(['2020-01-01 00:00:00', '2020-01-02 00:00:00', '2020-01-03 00:00:00',
       '2020-01-06 00:00:00', '2020-01-07 00:00:00', '2020-01-08 00:00:00',
       '2020-01-09 00:00:00', '2020-01-10 00:00:00', '2020-01-13 00:00:00',
       '2020-01-14 00:00:00'],
      dtype='object', name='Dates')

In [187]:
df_reloaded.index[0]

'2020-01-01 00:00:00'

In [188]:
type(df_reloaded.index[0])

str

In [189]:
query = "SELECT * FROM df"

# parse_dates is documented to take a list (or dict) of column names: a bare string is not a
# documented form and can be matched against column names character-wise or by substring,
# depending on the pandas version, so the column name is passed inside a list
df_reloaded = pd.read_sql(sql=query, con=con, index_col="Dates", parse_dates=["Dates"])

In [190]:
df_reloaded

Unnamed: 0_level_0,x,x^2,x^3,x^4,x^5
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01,1,1,1,1,1
2020-01-02,2,4,8,16,32
2020-01-03,3,9,27,81,243
2020-01-06,4,16,64,256,1024
2020-01-07,5,25,125,625,3125
2020-01-08,6,36,216,1296,7776
2020-01-09,7,49,343,2401,16807
2020-01-10,8,64,512,4096,32768
2020-01-13,9,81,729,6561,59049
2020-01-14,10,100,1000,10000,100000


In [191]:
df_reloaded.index

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-06',
               '2020-01-07', '2020-01-08', '2020-01-09', '2020-01-10',
               '2020-01-13', '2020-01-14'],
              dtype='datetime64[ns]', name='Dates', freq=None)

In [194]:
df_reloaded.index[0]

Timestamp('2020-01-01 00:00:00')

In [179]:
con.close()

---

## PANDAS + .csv

---

## PANDAS + Excel (maybe)

---

---

## PANDAS + Yahoo Finance

In [None]:
# for Yahoo Finance API
import yfinance as yf

In [None]:
data = yf.download("^GSPC", period="max")

In [None]:
# plot the High column from 2010 onwards; title and axis labels make the figure readable
# on its own, and the trailing ";" suppresses the textual repr of the returned object
ax = data.loc['2010-01-01':, 'High'].plot(title="^GSPC - High since 2010")
ax.set(xlabel="Date", ylabel="High");

In [None]:
data.head()

In [None]:
spx = yf.Ticker("^GSPC")
spx_hist = spx.history(period="max")

In [None]:
spx_hist.tail()

In [None]:
data2 = yf.download("SPY AAPL", start="2017-01-01", end="2017-04-30", group_by = 'ticker')

In [None]:
data2.head()

In [None]:
data2['SPY']