# Introduction to Pandas

Source: [https://www.datacamp.com/community/tutorials/feature-engineering-kaggle](https://www.datacamp.com/community/tutorials/feature-engineering-kaggle)

In [1]:
import pandas as pd

ModuleNotFoundError: No module named 'pandas'

### Read a CSV File

In [None]:
# Load the training data from 'train.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
df_train = pd.read_csv('train.csv')

Show the DataFrame

In [None]:
# A bare expression on the last line of a cell is rendered as the cell's output.
df_train

In [None]:
# Print a concise summary: index range, column names, non-null counts,
# dtypes and memory usage.
df_train.info()

---

#### Access a column

In [None]:
# Attribute-style access: returns the 'Name' column as a Series.
# This only works when the column name is a valid Python identifier that
# does not clash with an existing DataFrame attribute or method.
names = df_train.Name
names

In [None]:
# Bracket-style access to the 'Name' column; unlike attribute access,
# this form works for any column name.
df_train['Name']

---

#### Access a row

A row can be indexed using `loc` or `iloc`
- `loc` is label-based, which means that you have to specify rows and columns based on their row and column labels.
- `iloc` is integer position-based, so you have to specify rows and columns by their integer position values (0-based integer position).

In [None]:
# Label-based: select the row whose index label is 0. read_csv was called
# without index_col, so the index is the default 0..n-1 range and the
# label 0 refers to the first row.
df_train.loc[0]

In [None]:
# Position-based: select the first row, whatever the index labels are.
df_train.iloc[0]

In [None]:
# if the name would be the index...
# set_index returns a new frame with 'Name' moved from the columns into the
# row index. The result is rebound to df_train instead of using
# inplace=True, which keeps the cell in the same non-mutating style as the
# drop cells further down in this notebook.
df_train = df_train.set_index('Name')
df_train

In [None]:
# Positional lookup is unaffected by the index labels: this is still the first row.
df_train.iloc[0]

In [None]:
# Doesn't work: the index labels are now passenger names, so the label 0
# no longer exists and this lookup raises a KeyError.
# df_train.loc[0]

In [None]:
# Label-based lookup with the passenger name as the row label.
df_train.loc["Braund, Mr. Owen Harris"]

---

## Data Cleaning

In [None]:
# Reload the raw data: the frame above was re-indexed by 'Name', but the
# cleaning steps below expect 'Name' to be a regular column again.
df_train = pd.read_csv('train.csv')
df_train

In [None]:
# Inspect the raw 'Cabin' column before deriving a feature from it.
df_train.Cabin

**When we develop a machine learning model, we are often not interested in such detailed information as the cabin number. For example, it might be sufficient to determine whether the passenger had a cabin or not.**

In [None]:
# Boolean mask: True where a cabin value is present, False where it is missing.
df_train.Cabin.notna()

In [None]:
# Keep the "cabin value is present" flag as a new boolean column.
df_train['Has_Cabin'] = df_train['Cabin'].notna()
df_train

In [None]:
# Drop the column Cabin.
# The result is rebound instead of using inplace=True so this cell follows
# the same non-mutating pattern as the later drop cells, and `columns=`
# states explicitly that a column (not a row label) is removed.
df_train = df_train.drop(columns=['Cabin'])
df_train

#### Drop columns which are not useful

In [None]:
# Remove the Name, PassengerId and Ticket columns.
# The result is rebound instead of using inplace=True so this cell follows
# the same non-mutating pattern as the later drop cells, and `columns=`
# states explicitly that columns (not row labels) are removed.
df_train = df_train.drop(columns=['Name', 'PassengerId', 'Ticket'])
df_train

#### Handling Missing Values

In [None]:
# The non-null counts in this summary show which columns still contain missing values.
df_train.info()

In [None]:
# Median of the known ages (missing values are skipped by default).
df_train.Age.median()

In [None]:
# Impute missing values column by column: the numeric columns receive their
# own median, Embarked receives the fixed value 'S'.
fill_values = {
    'Age': df_train['Age'].median(),
    'Fare': df_train['Fare'].median(),
    'Embarked': 'S',
}
df_train = df_train.fillna(value=fill_values)

df_train

In [None]:
# Bin the numerical columns into quartiles. labels=False makes qcut return
# the integer bin number (0..3) instead of an interval label.
df_train = df_train.assign(
    Cat_Age=pd.qcut(df_train['Age'], q=4, labels=False),
    Cat_Fare=pd.qcut(df_train['Fare'], q=4, labels=False),
)
df_train

In [None]:
# Remove the Age and Fare columns.
df_train = df_train.drop(columns=['Age', 'Fare'])
df_train

#### Number of Members in Family Onboard

In [None]:
# Family size is the element-wise sum of the Parch and SibSp columns.
df_train['Fam_Size'] = df_train['Parch'].add(df_train['SibSp'])
df_train

In [None]:
# Remove the SibSp and Parch columns.
df_train = df_train.drop(columns=['SibSp', 'Parch'])
df_train

#### Transform Variables into Numerical Variables

In [None]:
# One-hot encode the remaining categorical (non-numeric) columns.
# drop_first=True omits the first level of each encoded column, so a
# two-level column such as 'Sex' yields a single indicator column
# ('Sex_male', which the plots below use).
df_train = pd.get_dummies(df_train, drop_first=True)
df_train

---
# Visualize Data

In [None]:
# seaborn is used for the plots in this section.
import seaborn as sns
# Apply seaborn's default theme to the plots drawn after this call.
sns.set_theme()

In [None]:
# Intentionally left commented out: reloading the raw CSV here would discard
# the engineered columns (e.g. Sex_male, Cat_Age) that the plots below use.
# df_train = pd.read_csv('train.csv')
# df_train

In [None]:
# Bar height is the mean of Survived for each Sex_male value (seaborn's
# default estimator); the error bars show the confidence interval.
# Title and axis labels make the figure readable on its own, and the
# trailing semicolon suppresses the text repr that would otherwise be
# printed above the plot.
ax = sns.barplot(data=df_train, x="Sex_male", y="Survived")
ax.set(title="Mean of Survived by Sex_male", xlabel="Sex_male", ylabel="Mean of Survived");

In [None]:
# For each Cat_Age bin, lineplot aggregates the Survived values to their
# mean and draws a confidence band around it (seaborn's defaults).
# Title and axis labels make the figure readable on its own, and the
# trailing semicolon suppresses the text repr that would otherwise be
# printed above the plot.
ax = sns.lineplot(data=df_train, x="Cat_Age", y="Survived")
ax.set(title="Mean of Survived by Cat_Age", xlabel="Cat_Age", ylabel="Mean of Survived");