# Pandas

### pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language

### Pandas provides two very useful data structures to process the data i.e. Series and DataFrame

In [2]:
import pandas as pd

### The Series is a one-dimensional array that can store various data types, including mix data types. The row labels in a Series are called the index.

In [3]:
f = ['FB', '2001-08-02', 90, 3.2]
f = pd.Series(f, index = ['name', 'date', 'shares', 'price'])
f

name              FB
date      2001-08-02
shares            90
price            3.2
dtype: object

### Series are used to work with one dimensional array, whereas DataFrame can be used with two dimensional arrays.

In [4]:
data = { 'name' : ['AA', 'IBM', 'GOOG'],
       'date' : ['2001-12-01', '2012-02-10', '2010-04-09'],
       'shares' : [100, 30, 90],
       'price' : [12.3, 10.3, 32.2]
 }
df = pd.DataFrame(data)
print(type(df))
df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,name,date,shares,price
0,AA,2001-12-01,100,12.3
1,IBM,2012-02-10,30,10.3
2,GOOG,2010-04-09,90,32.2


###### Data can be accessed in two ways i.e. using row and column index

In [5]:
 df['shares']

0    100
1     30
2     90
Name: shares, dtype: int64

In [6]:
df.index = ['one', 'two', 'three']

In [8]:
df.iloc[0]

name              AA
date      2001-12-01
shares           100
price           12.3
Name: one, dtype: object

In [9]:
df.loc['one']

name              AA
date      2001-12-01
shares           100
price           12.3
Name: one, dtype: object

In [20]:
data = pd.read_csv('StudentsPerformance.csv')
data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [36]:
data.info

<bound method DataFrame.info of      gender race/ethnicity parental level of education         lunch  \
0    female        group B           bachelor's degree      standard   
1    female        group C                some college      standard   
2    female        group B             master's degree      standard   
3      male        group A          associate's degree  free/reduced   
4      male        group C                some college      standard   
..      ...            ...                         ...           ...   
995  female        group E             master's degree      standard   
996    male        group C                 high school  free/reduced   
997  female        group C                 high school  free/reduced   
998  female        group D                some college      standard   
999  female        group D                some college  free/reduced   

    test preparation course  math score  reading score  writing score  
0                      none    

In [16]:
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [17]:
data.tail()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


In [18]:
data.head(8)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39


### Filtering data

In [26]:
math_A = data[data['math score']>80]
math_A

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
2,female,group B,master's degree,standard,none,90,95,93
6,female,group B,some college,standard,completed,88,95,92
16,male,group C,high school,standard,none,88,89,86
34,male,group E,some college,standard,none,97,87,82
35,male,group E,associate's degree,standard,completed,81,81,79
...,...,...,...,...,...,...,...,...
979,female,group C,associate's degree,standard,none,91,95,94
981,male,group D,some high school,standard,none,81,78,78
987,male,group E,some high school,standard,completed,81,75,76
990,male,group E,high school,free/reduced,completed,86,81,75


In [23]:
math_Am = data[(data['math score']>80) & (data['gender']=='male')]
math_Am

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
16,male,group C,high school,standard,none,88,89,86
34,male,group E,some college,standard,none,97,87,82
35,male,group E,associate's degree,standard,completed,81,81,79
49,male,group C,high school,standard,completed,82,84,82
53,male,group D,high school,standard,none,88,78,75
...,...,...,...,...,...,...,...,...
950,male,group E,high school,standard,none,94,73,71
956,male,group C,some college,standard,none,84,87,81
981,male,group D,some high school,standard,none,81,78,78
987,male,group E,some high school,standard,completed,81,75,76


In [25]:
class_data = data[(data['race/ethnicity']== 'group D') | (data['parental level of education']=='high school')].sort
class_data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50
11,male,group D,associate's degree,standard,none,40,52,43
12,female,group B,high school,standard,none,65,81,73
16,male,group C,high school,standard,none,88,89,86
...,...,...,...,...,...,...,...,...
994,male,group A,high school,standard,none,63,63,62
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


#### these data can be sorted using:
##### *sort_index* to sort based on index  
##### *sort_values()* to sort with respect to a particular values of an attribute   

In [29]:
#NULL values
c = pd.read_csv('cast.csv')
c

Unnamed: 0,title,year,name,type,character,n
0,Closet Monster,2015,Buffy #1,actor,Buffy 4,31.0
1,Suuri illusioni,1985,Homo $,actor,Guests,22.0
2,Battle of the Sexes,2017,$hutter,actor,Bobby Riggs Fan,10.0
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,
...,...,...,...,...,...,...
74996,Mia fora kai ena... moro,2011,Penelope Anastasopoulou,actress,Popi voulkanizater,11.0
74997,The Magician King,2004,Tiannah Anastassiades,actress,Unicycle Race Attendant,
74998,Festival of Lights,2010,Zoe Anastassiou,actress,Guidance Counselor,20.0
74999,Toxic Tutu,2016,Zoe Anastassiou,actress,Demon of Toxicity,


In [30]:
c['n'].isnull().head()

0    False
1    False
2    False
3     True
4     True
Name: n, dtype: bool

In [31]:
c['n'].notnull().head()

0     True
1     True
2     True
3    False
4    False
Name: n, dtype: bool

In [33]:
data.value_counts('parental level of education')

parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
dtype: int64

In [34]:
cf = c[c['name'] == 'Aaron Abrams']
cf.groupby(['year']).size().head()

year
2003    2
2004    2
2005    2
2006    1
2007    2
dtype: int64

In [35]:
cf.groupby(['year', 'title']).size().head()

year  title                               
2003  The In-Laws                             1
      The Visual Bible: The Gospel of John    1
2004  Resident Evil: Apocalypse               1
      Siblings                                1
2005  Cinderella Man                          1
dtype: int64

In [37]:
c.groupby(['year']).n.max()

year
1912      6.0
1913     14.0
1914     39.0
1915     14.0
1916     35.0
        ...  
2017    620.0
2018     21.0
2019      6.0
2020      NaN
2023      NaN
Name: n, Length: 110, dtype: float64

In [38]:
c.groupby(['year']).n.min()

year
1912    6.0
1913    1.0
1914    1.0
1915    1.0
1916    1.0
       ... 
2017    1.0
2018    2.0
2019    6.0
2020    NaN
2023    NaN
Name: n, Length: 110, dtype: float64

In [39]:
c.groupby(['year']).n.mean()

year
1912     6.000000
1913     4.142857
1914     7.085106
1915     4.236111
1916     5.037736
          ...    
2017    14.566667
2018     7.600000
2019     6.000000
2020          NaN
2023          NaN
Name: n, Length: 110, dtype: float64

In [41]:
c_decade = c.groupby( ['type', c['year']//10*10] ).size()
c_decade.unstack()

year,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010,2020
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
actor,384,710,2628,3014,2877,2775,3044,3565,5108,10368,15523,4
actress,285,411,820,983,1015,968,1299,1989,2544,5831,8853,3


In [42]:
data.drop('lunch',axis=1)

Unnamed: 0,gender,race/ethnicity,parental level of education,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,none,72,72,74
1,female,group C,some college,completed,69,90,88
2,female,group B,master's degree,none,90,95,93
3,male,group A,associate's degree,none,47,57,44
4,male,group C,some college,none,76,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,completed,88,99,95
996,male,group C,high school,none,62,55,55
997,female,group C,high school,completed,59,71,65
998,female,group D,some college,completed,68,78,77


#### fillna(): fill missing values in a DataFrame
#### merge(): merge two or more DataFrames based on a common column