#### The name pandas is derived from ‘panel data’ and is also a play on ‘Python data analysis’

Pandas is an open source Python library that allows users to explore,
manipulate and visualise data in an extremely efficient manner.

In [1]:
import pandas as pd

##### A Series is like a single column, while a DataFrame is a full table in pandas.

In [2]:
# Build a Series from a plain Python list.
l = [10, 20, 30, 40]
series1 = pd.Series(data=l)

In [3]:
print(series1)

0    10
1    20
2    30
3    40
dtype: int64


In [4]:
# Build a Series from a dictionary: the keys become the index labels.
d = {1: 'Monday', 2: 'Tuesday', 3: 'Wednesday'}
series2 = pd.Series(data=d)

In [5]:
series2

1       Monday
2      Tuesday
3    Wednesday
dtype: object

In [6]:
# Build a DataFrame from a list of rows; `columns` names the fields of each row.
l_df = [
    ['James', 23, 'NY'],
    ['Jason', 20, 'LA'],
    ['Renna', 25, 'NY'],
]
df1 = pd.DataFrame(data=l_df, columns=['Name', 'Age', 'Location'])

In [7]:
df1

Unnamed: 0,Name,Age,Location
0,James,23,NY
1,Jason,20,LA
2,Renna,25,NY


In [8]:
# Load the training data from a CSV file; the relative path is resolved against the
# notebook's working directory.
train = pd.read_csv('train.csv')

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [10]:
# nunique - Return Series with number of distinct observations over requested axis.
train.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [11]:
#selection

In [12]:
train['Name'].head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [13]:
train[['Name','Age']].tail()

Unnamed: 0,Name,Age
886,"Montvila, Rev. Juozas",27.0
887,"Graham, Miss. Margaret Edith",19.0
888,"Johnston, Miss. Catherine Helen ""Carrie""",
889,"Behr, Mr. Karl Howell",26.0
890,"Dooley, Mr. Patrick",32.0


In [14]:
# iloc slices by integer position and excludes the stop value, so this shows rows 1 through 4.
train.iloc[1:5]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [15]:
#Conditional selection

In [16]:
# Combine boolean masks with & (each condition in its own parentheses):
# keep rows where Pclass is 1 and Age is greater than 25.
train[(train['Pclass'] == 1) & (train['Age'] > 25)].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S


In [17]:
#GroupBy is used to split the data into groups.

In [18]:
# Average every numeric column per (Age, Pclass) group and show groups 1-4 by position.
# numeric_only=True is required on pandas >= 2.0, where mean() raises on text columns
# such as Name or Ticket instead of silently dropping them.
train.groupby(['Age', 'Pclass']).mean(numeric_only=True).iloc[1:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId,Survived,SibSp,Parch,Fare
Age,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.67,2,756.0,1.0,1.0,1.0,14.5
0.75,3,557.5,1.0,2.0,1.0,19.2583
0.83,2,455.5,1.0,0.5,1.5,23.875
0.92,1,306.0,1.0,1.0,2.0,151.55


In [19]:
train.groupby('Embarked')

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x00000295106C37B8>

In [20]:
#Aggregate functions

In [21]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [22]:
train['SibSp'].max()

8

In [23]:
# Pairwise correlation of the numeric columns only. Selecting them explicitly keeps
# this working on pandas >= 2.0, where corr() no longer drops text columns on its own.
train.select_dtypes(include='number').corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [24]:
train['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [25]:
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [26]:
#Cleaning

In [27]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [28]:
train[train['Age'].isnull()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q


In [29]:
# Replace missing ages with the mean age. Assign the result back instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form may act on a
# temporary copy and leave `train` unchanged (always the case under Copy-on-Write).
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [30]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [31]:
# Mean of each numeric column per passenger class. numeric_only=True is required on
# pandas >= 2.0, where mean() raises on text columns instead of dropping them.
train.groupby('Pclass').mean(numeric_only=True)

Unnamed: 0_level_0,PassengerId,Survived,Age,SibSp,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,461.597222,0.62963,37.048118,0.416667,0.356481,84.154687
2,445.956522,0.472826,29.866958,0.402174,0.380435,20.662183
3,439.154786,0.242363,26.403259,0.615071,0.393075,13.67555


In [32]:
# Mean of each numeric column per (Pclass, Sex) group. numeric_only=True is required on
# pandas >= 2.0, where mean() raises on text columns instead of dropping them.
train.groupby(['Pclass', 'Sex']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId,Survived,Age,SibSp,Parch,Fare
Pclass,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,female,469.212766,0.968085,34.141405,0.553191,0.457447,106.125798
1,male,455.729508,0.368852,39.287717,0.311475,0.278689,67.226127
2,female,443.105263,0.921053,28.748661,0.486842,0.605263,21.970121
2,male,447.962963,0.157407,30.653908,0.342593,0.222222,19.741782
3,female,399.729167,0.5,24.068493,0.895833,0.798611,16.11881
3,male,455.51585,0.135447,27.372153,0.498559,0.224784,12.661633


In [33]:
#concat

In [34]:
first_5 = train.head()

In [35]:
last_5 = train.tail()

In [36]:
combined = pd.concat([first_5,last_5],axis = 0)

In [37]:
combined

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [38]:
data = [['Braund, Mr. Owen Harris', 80, 177.0], ['Heikkinen, Miss. Laina', 78, 180.0], ['Montvila, Rev. Juozas', 87, 165.0]] 

In [39]:
df2 = pd.DataFrame(data,columns = ['Name','weight','height'] )

In [40]:
# A small stand-alone frame built from a list of [name, value] rows.
# NOTE(review): this rebinds `data`, which previously held the rows used for df2, and
# `df` is not referenced by any later cell in view -- confirm this cell is still needed.
data = [
    ['thomas', 100],
    ['nicholas', 200],
    ['danson', 300],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age'])

In [41]:
# Right join on 'Name': keep every row of df2 and attach the matching columns from train.
df3 = pd.merge(train,df2, how = 'right', on = 'Name')

In [42]:
df3

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,weight,height
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,80,177.0
1,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,78,180.0
2,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S,87,165.0


In [43]:
# Left join on 'Name' with df2 as the left frame: every df2 row is kept and its
# columns come first in the result.
df4 = pd.merge(df2,train, how = 'left', on = 'Name')

In [44]:
df4

Unnamed: 0,Name,weight,height,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,"Braund, Mr. Owen Harris",80,177.0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,"Heikkinen, Miss. Laina",78,180.0,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
2,"Montvila, Rev. Juozas",87,165.0,887,0,2,male,27.0,0,0,211536,13.0,,S


In [45]:
#Data manipulation

In [46]:
# astype returns a new Series cast to float; the result is only displayed here,
# so df2 itself is not modified.
df2['weight'].astype(float)

0    80.0
1    78.0
2    87.0
Name: weight, dtype: float64

In [47]:
#Applying function

In [48]:
def pclass_name(x):
    """Return the display label for a passenger-class code.

    1, 2 and 3 map to '1st Class', '2nd Class' and '3rd Class';
    any other value is returned unchanged.
    """
    if x == 1:
        return '1st Class'
    if x == 2:
        return '2nd Class'
    if x == 3:
        return '3rd Class'
    return x

In [49]:
# Replace each class code in df3['Pclass'] with its text label.
df3['Pclass'] = df3['Pclass'].apply(pclass_name)

In [50]:
df3

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,weight,height
0,1,0,3rd Class,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,80,177.0
1,3,1,3rd Class,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,78,180.0
2,887,0,2nd Class,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S,87,165.0


In [51]:
def height_convert(x):
    """Return (x / 100) squared.

    For a height given in centimetres this is the height in metres, squared --
    the value this notebook later divides the weight by to obtain the BMI.
    """
    metres = x / 100
    return metres ** 2
# Overwrite df3['height'] with the converted value.
# NOTE(review): after this the column holds the squared height, and re-running the
# cell converts it a second time.
df3['height'] = df3['height'].apply(height_convert)
    

In [52]:
df3

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,weight,height
0,1,0,3rd Class,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,80,3.1329
1,3,1,3rd Class,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,78,3.24
2,887,0,2nd Class,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S,87,2.7225


In [53]:
# BMI = weight / height squared. df3['height'] already holds the squared value
# produced by height_convert, so a plain division is used here.
# (assumes weight is in kilograms -- TODO confirm)
df3['Bmi'] = df3['weight']/df3['height']

In [54]:
df3

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,weight,height,Bmi
0,1,0,3rd Class,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,80,3.1329,25.535446
1,3,1,3rd Class,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,78,3.24,24.074074
2,887,0,2nd Class,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S,87,2.7225,31.955923


In [55]:
# Save the result to the working directory. A relative path (as already used for
# 'train.csv') keeps the notebook runnable on other machines, unlike a
# user-specific absolute Windows path.
df3.to_csv('df3.csv')