# General Codes

## Imports

In [0]:
import pandas as pd
import numpy as np

In [0]:
# Small demo frame used by every cell below: one string column, one numeric
# column, one sparse string column and one boolean column.
# np.nan is used for missing values: the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
d = {
    'Name': ['A', 'B', 'B', 'C', 'C', 'C'],
    'Place': [1, 2, np.nan, 3, 3, 0],
    'Animal': ['y', 'n', np.nan, np.nan, np.nan, np.nan],
    'Thing': [True, False, True, False, False, True],
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Name,Place,Animal,Thing
0,A,1.0,y,True
1,B,2.0,n,False
2,B,,,True
3,C,3.0,,False
4,C,3.0,,False
5,C,0.0,,True


## Prints

In [0]:
# Row count taken from the first entry of df.shape (same value as len(df)).
print('The data contains', df.shape[0], 'rows')

The data contains 6 rows


In [0]:
# df.shape is (rows, columns); unpack it straight into the two placeholders.
print('The shape of the data is {} rows and {} columns'.format(*df.shape))

The shape of the data is 6 rows and 4 columns


# Exploratory Data Analysis

## Data Structure

In [0]:
# First two rows of the frame (positional slice, same rows as df.head(2)).
df.iloc[:2]

Unnamed: 0,Name,Place,Animal,Thing
0,A,1.0,y,True
1,B,2.0,n,False


In [0]:
# Concise summary: index range, non-null count and dtype per column, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
Name      6 non-null object
Place     5 non-null float64
Animal    2 non-null object
Thing     6 non-null bool
dtypes: bool(1), float64(1), object(2)
memory usage: 230.0+ bytes


In [0]:
# dtype of every column, returned as a Series indexed by column name.
df.dtypes

Name       object
Place     float64
Animal     object
Thing        bool
dtype: object

In [0]:
# Descriptive statistics (count, mean, std, min, quartiles, max).
# With default arguments the output below only covers the float column Place.
df.describe()

Unnamed: 0,Place
count,5.0
mean,1.8
std,1.30384
min,0.0
25%,1.0
50%,2.0
75%,3.0
max,3.0


In [0]:
# Row labels of the frame (a RangeIndex for this demo data).
df.index

RangeIndex(start=0, stop=6, step=1)

## Filter Data Structures

In [0]:
# Number of columns whose dtype is object.
df.dtypes.eq(object).sum()

2

In [0]:
# Keep only the object-dtype columns.
df.select_dtypes(include='object')

Unnamed: 0,Name,Animal
0,A,y
1,B,n
2,B,
3,C,
4,C,
5,C,


In [0]:
# Display the whole frame (small enough here to show in full).
df

Unnamed: 0,Name,Place,Animal,Thing
0,A,1.0,y,True
1,B,2.0,n,False
2,B,,,True
3,C,3.0,,False
4,C,3.0,,False
5,,0.0,,True


## Which Data 

In [0]:
# Full row holding the largest Place value.
# idxmax() returns a single index label, so the result is one row as a Series.
df.loc[df['Place'].idxmax(), :]

Name          C
Place         3
Animal      NaN
Thing     False
Name: 3, dtype: object

In [0]:
# Rows whose Name contains the substring 'C'.
# na=False counts a missing Name as "no match"; without it a NaN in Name puts
# NaN into the mask and boolean indexing raises an error.
df[df['Name'].str.contains('C', na=False)]

Unnamed: 0,Name,Place,Animal,Thing
3,C,3.0,,False
4,C,3.0,,False
5,C,0.0,,True


## Missing Values

In [0]:
# Missing-value count for every column (0 means the column is complete).
# Same output as: len(df) - df.count()
df.isna().sum()

Name      1
Place     1
Animal    3
Thing     0
dtype: int64

In [0]:
# Per-column missing-value counts, keeping only columns with at least one gap.
# The callable passed to .loc filters the counts without computing them twice.
df.isnull().sum().loc[lambda counts: counts > 0]

Name      1
Place     1
Animal    3
dtype: int64

In [0]:
# Set of column names with no missing values (share of missing entries is 0).
# Swap .eq(0) for .gt(0.75) to get columns with more than 75% values missing.
set(df.columns[df.isna().mean().eq(0)])

{'Thing'}

In [0]:
# Rows that contain at least one missing value in any column.
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,Name,Place,Animal,Thing
2,B,,,True
3,C,3.0,,False
5,,0.0,,True


In [0]:
# Number of non-null entries in the Animal column.
df['Animal'].notna().sum()

3

## Duplicates

In [0]:
# Number of rows that exactly repeat an earlier row (first occurrence not counted).
df.duplicated(keep='first').sum()

1

In [0]:
# Rows whose Name already appeared in an earlier row (first occurrence excluded).
df.loc[df['Name'].duplicated(keep='first')]

Unnamed: 0,Name,Place,Animal,Thing
2,B,,,True
4,C,3.0,,False


In [0]:
# DROP DUPLICATES: keep the first occurrence of every fully identical row.
# Returns a new frame and leaves df unchanged; assign the result to keep it.
df.drop_duplicates(keep='first')

Unnamed: 0,Name,Place,Animal,Thing
0,A,1.0,y,True
1,B,2.0,n,False
2,B,,,True
3,C,3.0,,False
5,,0.0,,True


In [0]:
# Re-display df: drop_duplicates() above returned a new frame without being
# assigned, so df itself still contains the duplicate row.
df

Unnamed: 0,Name,Place,Animal,Thing
0,A,1.0,y,True
1,B,2.0,n,False
2,B,,,True
3,C,3.0,,False
4,C,3.0,,False
5,,0.0,,True


## Aggregates

In [0]:
# What is the aggregate count of each Name? Missing names are not counted.
# Attribute form: df.x.value_counts()
df['Name'].value_counts(dropna=True)


C    2
B    2
A    1
Name: Name, dtype: int64

# Feature Engineering

## Columns

In [0]:
# Rename two columns. Returns a new frame and leaves df unchanged;
# assign the result to keep the new names.
df.rename({'Name': 'Names', 'Animal': 'Animals'}, axis='columns')

Unnamed: 0,Names,Place,Animals,Thing
0,A,1.0,y,True
1,B,2.0,n,False
2,B,,,True
3,C,3.0,,False
4,C,3.0,,False
5,C,0.0,,True


In [0]:
# Drop two columns by name. Returns a new frame and leaves df unchanged;
# assign the result to keep the change.
df.drop(columns=['Name', 'Animal'])

Unnamed: 0,Place,Thing
0,1.0,True
1,2.0,False
2,,True
3,3.0,False
4,3.0,False
5,0.0,True


In [0]:
# Keep only the listed columns, in the listed order.
df.loc[:, ['Name', 'Animal']]

Unnamed: 0,Name,Animal
0,A,y
1,B,n
2,B,
3,C,
4,C,
5,C,


## Replace Values

In [0]:
# Replace values MANUALLY: encode 'n' -> 0, 'y' -> 1 and missing -> 0.5.
# np.nan is used as the key: the np.NaN alias was removed in NumPy 2.0 and
# raises AttributeError there.
df.Animal.map({'n': 0, 'y': 1, np.nan: 0.5})

0    1.0
1    0.0
2    0.5
3    0.5
4    0.5
5    0.5
Name: Animal, dtype: float64

### Impute Missing Values

In [0]:
# Replace missing Animal values with the placeholder 'o' (returns a new Series).
df['Animal'].fillna(value='o')

0    y
1    n
2    o
3    o
4    o
5    o
Name: Animal, dtype: object

In [0]:
# Impute 0 values with the column mean (returns a new Series).
# The mean is taken over all non-missing values, zeros included, and existing
# NaN entries are left as they are.
df['Place'].replace(to_replace=0, value=df['Place'].mean())

0    1.0
1    2.0
2    NaN
3    3.0
4    3.0
5    1.8
Name: Place, dtype: float64

### Drop Missing Values

In [0]:
# Drop every row that has at least one missing value.
# Use how='all' instead to drop only rows where every value is missing.
df.dropna(axis='index', how='any')

Unnamed: 0,Name,Place,Animal,Thing
0,A,1.0,y,True
1,B,2.0,n,False


In [0]:
# Drop every column that has at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,Name,Thing
0,A,True
1,B,False
2,B,True
3,C,False
4,C,False
5,C,True


In [0]:
# Drop rows whose Name is missing; gaps in other columns are ignored.
df.dropna(axis='index', subset=['Name'])

Unnamed: 0,Name,Place,Animal,Thing
0,A,1.0,y,True
1,B,2.0,n,False
2,B,,,True
3,C,3.0,,False
4,C,3.0,,False
5,C,0.0,,True


## Keep/Drop Columns