# Python Data Analysis

## Import all necessary modules

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

## Series

In [3]:
np.random.randn(5)

array([-1.32885024, -0.22429626,  0.92240959, -0.33842857,  0.21741054])

In [4]:
# Index labels for the five values below.
labels = ['a', 'b', 'c', 'd', 'e']
# A Series pairs each value with a label; here five random draws are labelled 'a'..'e'.
s = pd.Series(np.random.randn(5), index=labels)
s

a   -0.093193
b    0.190662
c    0.676822
d    0.458695
e    0.939050
dtype: float64

In [5]:
'b' in s

True

In [6]:
s['b']

0.1906624883185567

In [7]:
s.to_dict()

{'a': -0.093193314533478372,
 'b': 0.1906624883185567,
 'c': 0.6768224476260305,
 'd': 0.45869495939229787,
 'e': 0.93905006231997479}

In [8]:
# Rebuild the Series from its dict form while choosing the index explicitly.
# Dict keys absent from the index ('c') are dropped; index labels absent from the dict ('f') become NaN.
s = pd.Series(s.to_dict(), index=['b', 'e', 'a', 'd', 'f'])
s

b    0.190662
e    0.939050
a   -0.093193
d    0.458695
f         NaN
dtype: float64

In [9]:
s.dropna()

b    0.190662
e    0.939050
a   -0.093193
d    0.458695
dtype: float64

In [10]:
s * 2

b    0.381325
e    1.878100
a   -0.186387
d    0.917390
f         NaN
dtype: float64

In [11]:
a_1 = list(range(100))

In [12]:
a_1[:12:2]

[0, 2, 4, 6, 8, 10]

In [13]:
s[3:]

d    0.458695
f         NaN
dtype: float64

In [None]:
s[:3]

In [None]:
s.index

## DataFrame: 2D collection of Series

In [None]:
# Each dict key becomes a column; all three columns hold six values.
# 'a' and 'c' are random floats, 'b' alternates between two strings.
df = pd.DataFrame(
    {
        'a': np.random.randn(6),
        'b': ['foo', 'bar'] * 3,
        'c': np.random.randn(6),
    }
)
df

In [None]:
df['d'] = range(6)

In [None]:
df

In [None]:
# Keep rows where both 'c' and 'a' are positive. Each comparison needs its own
# parentheses because `&` binds more tightly than `>`.
df[(df['c'] > 0) & (df['a'] > 0)]

In [None]:
df.info()

In [None]:
df.dtypes

In [None]:
df['b']    # by column

In [None]:
df[:3]

In [None]:
df[:-2]

In [None]:
df[-2:]   # the last 2 rows

In [None]:
df[['a','c']]

In [None]:
df.loc[2]    # label-based row lookup: the row whose index label is 2 (the 3rd row under the default 0..5 index), returned as a Series

In [None]:
df.loc[2, 'b']

In [None]:
df.loc[2:4, 'b']

In [None]:
df.loc[2:4, ['b', 'c']]

In [None]:
df.iloc[2:4, 1:2]   # position-based: rows 2-3 and only column 1 ('b'). iloc excludes the slice end, so this is NOT the same selection as df.loc[2:4, ['b', 'c']] (rows 2-4, columns 'b' and 'c').

In [None]:
df.loc[[0, 2, 4], ['b', 'c', 'd']]   # pass a list of rows and columns I want to select out

In [None]:
df[df['c'] > 0]

In [None]:
df.loc[df['c'] > 0]   # boolean arrays

In [None]:
df.index

In [None]:
df.columns

In [None]:
df.T

In [None]:
df.T.T

In [None]:
df['a'].dot(df['a'])

## Working with a real dataset

In [None]:
pd.read_csv?

In [None]:
data = pd.read_csv("store_sales.txt")

In [None]:
data

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data['Sales'].sum()

In [None]:
data[(data['Province'] == 'Yukon')]

In [None]:
data[['Province', 'Product Category']]

In [None]:
data[['Province', 'Product Category']].drop_duplicates()

In [None]:
data.groupby(['Province', 'Product Category'])

In [None]:
data.groupby(['Province', 'Product Category']).groups

In [None]:
data.groupby(['Province', 'Product Category']).groups.keys()

In [None]:
# `.groups` maps each group key to the index labels of its rows, so the length
# of one entry is the number of rows in the ('Alberta', 'Furniture') group.
len(data.groupby(['Province', 'Product Category']).groups[('Alberta', 'Furniture')])

In [None]:
data.groupby(['Province', 'Product Category']).agg({'Sales':'sum'})

In [None]:
data.groupby(['Province', 'Product Category']).agg({'Sales':['sum', 'mean']})

In [None]:
data.groupby(['Province', 'Product Category']).agg({'Sales':['sum', 'mean', lambda x: x.max()]})

In [None]:
def get_range(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` can be any object exposing ``.max()`` and ``.min()`` (for example a
    pandas Series or a NumPy array), which lets the function serve as a custom
    aggregation for ``groupby(...).agg`` and ``rolling(...).apply``.
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest
data.groupby(['Province', 'Product Category']).agg({'Sales':['sum', 'mean', get_range]})

In [None]:
g1 = data.groupby(['Province', 'Product Category'])

def cust_func(group):
    """Return the 'Sales' values of ``group`` that are strictly greater than 10.

    ``group`` is a DataFrame with a 'Sales' column (one group handed over by
    ``groupby(...).apply``). The result is a Series that keeps the original
    row labels of the rows that passed the filter.
    """
    above_threshold = group['Sales'] > 10
    return group.loc[above_threshold, 'Sales']
g1.apply(cust_func)


In [None]:
# Pivot sales into a (Province, Customer Segment) x Product Category table,
# computing both the total and the mean of 'Sales' for every cell.
# Aggregations are named by string: passing NumPy callables such as np.sum /
# np.mean to pandas aggregations emits a FutureWarning on pandas >= 2.1, while
# the strings produce the same 'sum' / 'mean' column labels on every version.
pi_data = data.pivot_table(
    index=['Province', 'Customer Segment'],
    columns=['Product Category'],
    values=['Sales'],
    aggfunc=['sum', 'mean'])
pi_data

In [None]:
pi_data.columns

In [None]:
# Write next to the notebook instead of into one specific user's home directory,
# so the cell also runs on other machines.
pi_data.to_csv("pi_data_1.csv")

## Time Series analysis

In [None]:
sd = pd.read_csv("stock_data.csv")
sd

In [None]:
sd.info()

In [None]:
sd.describe()

In [None]:
# `sd` was read without an index column, so any text column (e.g. the dates) is
# still part of the frame. pandas >= 2.0 raises on non-numeric columns here, so
# restrict the covariance to the numeric columns explicitly.
sd.select_dtypes(include='number').cov()

In [None]:
# Correlate only the numeric columns: pandas >= 2.0 no longer drops text
# columns silently and raises instead.
sd.select_dtypes(include='number').corr()

In [None]:
# Show the correlation matrix of the numeric columns as an image
# (non-numeric columns would make corr() raise on pandas >= 2.0).
plt.matshow(sd.select_dtypes(include='number').corr())

In [None]:
# Correlation heatmap of the numeric columns.
# seaborn is available as `sns` from the imports cell at the top of the notebook.
f, ax = plt.subplots(figsize=(10, 8))
# Non-numeric columns are excluded first; corr() raises on them with pandas >= 2.0.
corr = sd.select_dtypes(include='number').corr()
sns.heatmap(
    corr,
    # All-False mask, so no cell is hidden. The builtin `bool` replaces the
    # `np.bool` alias, which was removed in NumPy 1.24.
    mask=np.zeros_like(corr, dtype=bool),
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    square=True,
    ax=ax,
)

In [None]:
sd['AAPL'].rolling(5).mean()

In [None]:
# Custom rolling statistic: get_range (defined earlier in this notebook) returns
# max - min of each 5-row window.
sd['AAPL'].rolling(5).apply(get_range)

In [None]:
sd_apple = sd[['AAPL']].copy()
sd_apple

In [None]:
# Log return between consecutive rows: ln(value_t / value_{t-1}).
# shift(1) moves the column down by one row, so the first row has no previous
# value and its return is NaN.
sd_apple['returns'] = np.log(sd_apple['AAPL'] / sd_apple['AAPL'].shift(1))
sd_apple

In [None]:
# SMA_1: simple moving average over a 60-row window (the slow average).
# The first 59 rows are NaN because a full window is not yet available.
sd_apple['SMA_1'] = sd_apple['AAPL'].rolling(60).mean()
sd_apple

In [None]:
# SMA_2: simple moving average over a 10-row window (the fast average).
# The first 9 rows are NaN because a full window is not yet available.
sd_apple['SMA_2'] = sd_apple['AAPL'].rolling(10).mean()
sd_apple

In [None]:
sd_apple[['AAPL','SMA_1', 'SMA_2']].plot(figsize=(10,6))

In [None]:
# Re-read the same file, this time using the first column as the index and
# asking pandas to parse that index as dates.
sd1 = pd.read_csv("stock_data.csv", index_col=0, parse_dates=True)



In [None]:
sd1.info()

In [None]:
sd.info()