# Pandas Tutorial (One Shot)
---

In [1]:
import pandas as pd
import numpy as np

>#### Creating Series

A series can be created from a Python list, a NumPy array or a dictionary. A range of mathematical operations can also be applied to modify the series.

In [3]:
# --- Series from a plain Python list, with explicit integer labels ---
list_1 = list("abcd")
list_1_labels = list(range(1, 5))
series_1 = pd.Series(list_1, index=list_1_labels)

# --- Series from a NumPy array (no index given, so the default one is used) ---
np_arr_1 = np.array([1, 2, 3, 4])
series_2 = pd.Series(data=np_arr_1)

# --- Series from a dict: the keys become the index labels ---
dict_1 = dict(name="ath", age=21, nationality="indian")
series_3 = pd.Series(data=dict_1, name="ath details")
series_3.name  # gives back the name assigned on the line above

# --- Element-wise arithmetic on a Series ---
series_2 + series_2  # addition
series_2 * series_2  # multiplication
np.exp(series_2)     # NumPy functions work element-wise too (this is the cell's output)

0     2.718282
1     7.389056
2    20.085537
3    54.598150
dtype: float64

>#### Creating Dataframes

*`pd.DataFrame()` parameters...*
- `data` (values)
- `index` (row_labels)
- `columns` (column_labels)

In [4]:
# --- NumPy array -> DataFrame ---

# 3x3 array of random integers from 10 (inclusive) up to 50 (exclusive)
np_arr_2 = np.random.randint(low=10, high=50, size=(3, 3))
df_1 = pd.DataFrame(
    data=np_arr_2,
    index=["Row1", "Row2", "Row3"],
    columns=["Col1", "Col2", "Col3"],
)
df_1

# --- dict of Series -> DataFrame ---
# Each key becomes a column. The two Series have different label sets, so the
# label that 'col1' lacks ('row4') shows up as NaN in that column.
dict_2 = {
    "col1": pd.Series([1, 2, 3], index=["row1", "row2", "row3"]),
    "col2": pd.Series([1.0, 2.0, 3.0, 4.0], index=["row1", "row2", "row3", "row4"]),
}
df_2 = pd.DataFrame(data=dict_2)
df_2

Unnamed: 0,col1,col2
row1,1.0,1.0
row2,2.0,2.0
row3,3.0,3.0
row4,,4.0


>#### Editing and Retrieving Data

- Obtain columns using...
    - `dataframe.column_name` OR `dataframe['column_name']`

- Obtain rows and/or columns using...
    - `dataframe.loc[row_label,column_label]` (can also supply a range of values/index ranges; start and end labels are inclusive)

    - `dataframe.iloc[row_index,column_index]` (can also supply a range of values/index ranges; start indices are inclusive, end indices are exclusive)

- Add new columns using `dataframe[<new_column>] = values` and new rows using `dataframe = pd.concat([dataframe, <new_row>.to_frame().T])` (the older `dataframe = dataframe.append(<new_row>)` was deprecated in pandas 1.4 and removed in pandas 2.0)
    - Note: If you want the rows to be indexed using 0,1,2,3... add `ignore_index=True` in the `pd.concat()` call (or in `dataframe.append()` on pandas < 2.0)
    - If you want to name the row manually, specify `name=<new_row_name>` in the `pd.Series()` argument

- Delete columns/rows by specifying the row/column name, axis (`axis=0` for deleting rows; `axis=1` for deleting columns) and `inplace=True` in the `dataframe.drop()` argument

In [5]:
# --- one column: plain brackets, label-based .loc, position-based .iloc ---
df_1["Col1"]
df_1.loc[:, "Col1"]
df_1.iloc[:, 0]

# --- one row: by label, then by position ---
df_1.loc["Row1", :]
df_1.iloc[0, :]

# --- several columns at once ---
df_1[["Col1", "Col2"]]
df_1.loc[:, ["Col1", "Col2"]]
df_1.iloc[:, :2]

# --- rows and columns together, selected by label ---
df_1.loc["Row1":"Row3", "Col1":"Col2"]        # label slices
df_1.loc[["Row1", "Row3"], ["Col1", "Col3"]]  # explicit label lists

# --- rows and columns together, selected by integer position ---
df_1.iloc[:3, :3]           # position slices
df_1.iloc[[0, 2], [0, 2]]   # explicit position lists

# a row label plus a column label picks out one single value (this is the cell's output)
df_1.loc["Row1", "Col2"]

37

In [6]:
# adding a new column: the row-wise total of the three data columns
df_1['Col_Total'] = df_1['Col1'] + df_1['Col2'] + df_1['Col3']
df_1

# adding a new row: the column-wise total of the three data rows;
# the Series name becomes the label of the new row
new_row = pd.Series(df_1.loc['Row1'] + df_1.loc['Row2'] + df_1.loc['Row3'], name='Row_Total')
# DataFrame.append() was removed in pandas 2.0, so turn the Series into a
# one-row frame (to_frame().T) and concatenate it below the existing rows
df_1 = pd.concat([df_1, new_row.to_frame().T])
df_1

Unnamed: 0,Col1,Col2,Col3,Col_Total
Row1,22,37,49,108
Row2,29,31,20,80
Row3,36,31,39,106
Row_Total,87,99,108,294


In [7]:
# remove a column in place (columns=... is the keyword spelling of axis=1)
df_1.drop(columns="Col_Total", inplace=True)

# remove a row in place (index=... is the keyword spelling of axis=0)
df_1.drop(index="Row_Total", inplace=True)

In [8]:
# --- promote a column to be the index ---
df_1["Row_names"] = ["Row1", "Row2", "Row3"]
df_1.set_index(keys="Row_names", inplace=True)

# --- reset the index, then discard the 'Row_names' column ---
df_1.reset_index(inplace=True)
df_1.drop(columns="Row_names", inplace=True)

>#### Conditional Selection

In [9]:
# 3x3 array of random integers from 1 (inclusive) up to 10 (exclusive);
# no row labels are supplied, so the frame gets the default index
np_arr_3 = np.random.randint(low=1, high=10, size=(3, 3))
df_3 = pd.DataFrame(data=np_arr_3, columns=["Col1", "Col2", "Col3"])
df_3

Unnamed: 0,Col1,Col2,Col3
0,1,8,8
1,4,9,3
2,7,5,4


In [10]:
# keep only the rows whose 'Col1' value is less than or equal to 5
df_3[df_3["Col1"].le(5)]

Unnamed: 0,Col1,Col2,Col3
0,1,8,8
1,4,9,3


>#### File Input/Output

- Pandas can work with...
    - .csv files: 
        - Read = `pd.read_csv(filename)`
        - Write = `dataframe.to_csv(filename)`
    - .xlsx files: 
        - Read = `pd.read_excel(filename)`
        - Write = `dataframe.to_excel(filename)`

    - Databases:
        ```
        import pymysql

        db_connection = None  # stays None if connect() fails, so `finally` is safe
        try:
            # check the port number before connecting
            db_connection = pymysql.connect(db = database_name, user = user_name, passwd = password_name, host = 'localhost', port = port_num)
            dataframe = pd.read_sql('SELECT * FROM database_name', con=db_connection)
            print(database_name)

        except Exception as e:
            print('Exception {}'.format(e))

        finally:
            if db_connection is not None:
                db_connection.close()
        ```

>#### Dataframe basics

- Use `dataframe['column_name'].map()` to transform/modify a series and `dataframe.applymap()` or `dataframe.transform()` to transform/modify a dataframe (`dataframe.applymap()` was renamed to `dataframe.map()` in pandas 2.1). [Remember to specify the `lambda x: x...` function within the respective brackets]

In [52]:
# load the weather data set from a CSV file in the working directory
weather_df = pd.read_csv("nyc_weather_data.csv")

In [233]:
# peek at both ends of the frame
weather_df.head()  # first 5 rows
weather_df.tail()  # last 5 rows

# the frame's values as a NumPy array
weather_df.to_numpy()

# every row index, as an array
weather_df.index.array

# one function for the whole frame: add 1 to every value
df_3.transform(lambda value: value + 1)

# one function per column: add 2 to the first column, subtract 1 from the
# second column and leave the third column unchanged
df_3.transform(
    {
        "Col1": lambda value: value + 2,
        "Col2": lambda value: value - 1,
        "Col3": lambda value: value,
    }
)

Unnamed: 0,Col1,Col2,Col3
0,5,4,3
1,7,3,6
2,10,2,4


In [23]:
# distinct values found in one column / one row
df_3["Col1"].unique()
df_3.loc[0].unique()

# how many distinct values one column / one row holds
df_3["Col1"].nunique()
df_3.loc[0].nunique()

# how many times each value occurs in one column / one row
df_3["Col1"].value_counts()
df_3.loc[0].value_counts()

# the column names of the dataframe
df_3.columns

# the index of the dataframe
df_3.index

Unnamed: 0,Col1,Col2,Col3
0,,,
1,,,
2,,,


>#### Grouping Data

In [24]:
# toy sales records: one entry per (store, flavour) pair
dict_3 = dict(
    Store=[1, 2, 1, 2],
    Flavor=["Choc", "Van", "Straw", "Choc"],
    Sales=[26, 12, 18, 22],
)

df_4 = pd.DataFrame(data=dict_3)

Unnamed: 0,Store,Flavor,Sales
0,1,Choc,26
1,2,Van,12
2,1,Straw,18
3,2,Choc,22


In [36]:
# group by a column name + use an aggregate func

# average sales per store
# numeric_only=True keeps the text column 'Flavor' out of the aggregation;
# without it pandas >= 2.0 raises a TypeError when asked for the mean of strings
by_store_avg_sales = df_4.groupby('Store').mean(numeric_only=True)
by_store_avg_sales

# total sales per store, then look up the total for store 2
# (numeric_only=True again, otherwise pandas >= 2.0 concatenates the 'Flavor' strings)
by_store_sum_sales = df_4.groupby('Store').sum(numeric_only=True)
by_store_sum_sales.loc[2]

# basic descriptive statistics for each store
by_store_all_stats = df_4.groupby('Store').describe()
by_store_all_stats

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Store,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,2.0,22.0,5.656854,18.0,20.0,22.0,24.0,26.0
2,2.0,17.0,7.071068,12.0,14.5,17.0,19.5,22.0


>#### Statistics

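- Kurtosis (`.kurt()`) gives an idea of the number of outliers in the data. Pandas returns the *excess* kurtosis (Fisher's definition), for which a normal distribution scores 0...
    - value < 0: few outliers
    - value = 0: normal distribution
    - value > 0: many outliers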

In [72]:
# number of non-NaN values in each column of the dataframe
weather_df.count()

# gives total of all the values in a column, whilst ignoring NaN values
weather_df['Temperature'].sum(skipna=True)

# summary of stats, of a particular col/dataframe
weather_df['Temperature'].describe()

# product of all values in a column
# The column is int64 and holds 31 readings, so the integer product is far
# outside the int64 range and would silently wrap around; multiply as floats.
weather_df['Temperature'].astype('float64').product()

# standard error of all numerical data
# (numeric_only=True is required on pandas >= 2.0, which no longer drops
# non-numeric columns automatically and raises a TypeError instead)
weather_df.sem(numeric_only=True)

# skewness of values of all numerical data
weather_df.skew(numeric_only=True)

# kurtosis of values of all numerical data
weather_df.kurt(numeric_only=True)

# cumulative sum of a particular column
weather_df['Temperature'].cumsum()

0       38
1       74
2      114
3      139
4      159
5      192
6      231
7      270
8      314
9      364
10     397
11     432
12     458
13     488
14     531
15     578
16     614
17     639
18     661
19     693
20     724
21     750
22     776
23     804
24     838
25     881
26     922
27     959
28     995
29    1029
30    1075
Name: Temperature, dtype: int64

>#### Iterating through a Series and DataFrame

In [75]:
# a small labelled Series: values 0..4 under the labels 'a'..'e'
series_4 = pd.Series(data=range(5), index=list("abcde"))

# 3x3 frame of random integers from 10 (inclusive) up to 50 (exclusive),
# with rows labelled 1..3 and columns labelled 'A'..'C'
np_arr_4 = np.random.randint(low=10, high=50, size=(3, 3))
df_8 = pd.DataFrame(data=np_arr_4, index=[1, 2, 3], columns=list("ABC"))

In [89]:
# a Series iterates over its values
for item in series_4:
    print(item)

# DataFrame.items() walks the columns: each step gives the column label and
# that column's values, indexed by row
for col_label, col_values in df_8.items():
    print(col_label)
    print(col_values)

# DataFrame.iterrows() walks the rows: each step gives the row label and
# that row's values, indexed by column
for row_label, row_values in df_8.iterrows():
    print(row_label)
    print(row_values)

>#### Sorting

In [96]:
# show the unsorted frame first, for comparison
print(df_8)

# order the rows by index label, largest label first
df_8.sort_index(ascending=False)

# order the rows by the values in column 'B', smallest value first
df_8.sort_values("B", ascending=True)

    A   B   C
1  26  40  41
2  35  30  39
3  49  19  21


Unnamed: 0,A,B,C
3,49,19,21
2,35,30,39
1,26,40,41


>#### Handling Missing Data

In [98]:
# three float columns with 1, 2 and 0 missing values respectively
dict_4 = dict(
    A=[1.0, 2.0, np.nan],
    B=[4.0, np.nan, np.nan],
    C=[7.0, 8.0, 9.0],
)

df_9 = pd.DataFrame(data=dict_4)

In [114]:
# drop rows (axis=0) / cols (axis=1) containing missing values
df_9.dropna(axis=0)
df_9.dropna(axis=1)

# drop rows/cols containing less than 2 non-NaN values
df_9.dropna(thresh=2, axis=0)
df_9.dropna(thresh=2, axis=1)

# fill all NaN values by a particular value
# (a constant, or a computed one such as the mean of column 'A')
df_9.fillna(value=0.0)
df_9.fillna(value=df_9['A'].mean())

# fill NaN values by the previous value (forward fill)
# fillna(method='ffill') is deprecated since pandas 2.1; DataFrame.ffill()
# is the dedicated method and gives the same result
df_9.ffill()

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,4.0,8.0
2,2.0,4.0,9.0
