In [None]:
# Frequency table for the 'Age' column: one entry per distinct value with its occurrence count.
age_counts = train_df['Age'].value_counts()
age_counts

In [None]:
# Conditional assignment with .loc: build a boolean row mask first, then
# overwrite 'Fare' with 0 on exactly the rows where the mask is True.
low_fare = dataset['Fare'] <= 7.91
dataset.loc[low_fare, 'Fare'] = 0

In [None]:
# For the rows whose 'Cabin' is null, list the distinct 'Pclass' values.
# isnull() already yields booleans, so the mask is used directly, and a single
# .loc[row_mask, column] lookup replaces the chained df[mask][column] form.
df.loc[df['Cabin'].isnull(), 'Pclass'].unique()

In [None]:
# Missing-value summary: absolute count and percentage of nulls per column,
# restricted to columns with at least one null, largest first.
null_flags = train_df.isnull()
total = null_flags.sum()
total = total[total != 0].sort_values(ascending=False)
perce = null_flags.sum() / null_flags.count() * 100
perce = perce[perce != 0].sort_values(ascending=False)
pd.concat([total, perce], axis=1, keys=['Total', 'Percentage(%)'])

In [None]:
# 1) np.isnan() returns a boolean array, which can be used as a mask to index a Series.
# 2) The same kind of mask can select the positions of a Series to overwrite.
ser=pd.Series([10,20,30])
ser[np.isnan([1,2, np.nan])]  # the mask is True only at the last position
age_slice = dataset["Age"].copy()  # work on a copy of the column
# NOTE(review): `rand_age` is not defined in this cell; presumably it holds the
# replacement value(s) for the missing ages — confirm against the cell that creates it.
age_slice[np.isnan(age_slice)] = rand_age #(rand_age has same range)

In [None]:
# astype() is called on the column itself; the converted values are then
# written back under the same column name.
fare_as_int = dataset['Fare'].astype(int)
dataset['Fare'] = fare_as_int

In [None]:
# Use a dictionary as a lookup table to replace specific values:
# each 'Sex' entry is mapped to the integer stored under that key.
genders = dict(male=0, female=1)
dataset['Sex'] = dataset['Sex'].map(genders)

In [None]:
# fillna(0) puts 0 in place of every missing entry of the 'Title' column;
# the filled column is then stored back into the DataFrame.
title_filled = dataset['Title'].fillna(0)
dataset['Title'] = title_filled

In [None]:
# Loading data from a particular location into local path & unzipping it.

import os
import tarfile
from six.moves import urllib

# Base URL under which the dataset archive is hosted.
root = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Local directory (relative path) used for the downloaded archive and its contents.
path = os.path.join("datasets", "housing")
# Full URL of the compressed housing archive.
source = root + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=source, path=path):
    """Download the housing archive and unpack it into ``path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        path: Local directory that receives ``housing.tgz`` and the
            extracted files; it is created when it does not exist yet.
    """
    if not os.path.isdir(path):
        os.makedirs(path)
    tgz_path = os.path.join(path, "housing.tgz")
    # A target filename is passed, so the download is saved at tgz_path.
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The with-block closes the archive even if extraction raises.
    # NOTE(review): extractall() writes whatever member paths the archive
    # contains; only use this with an archive source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=path)  # This creates csv


# Download and extract the archive using the default URL and target directory.
fetch_housing_data()

def load_data(housing_path=path):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

housing = load_data()  # read housing.csv from the default directory
housing.head()  # preview the first rows

In [None]:
# Stratified sampling: generate several shuffled train/test index splits.

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Toy data: 6 samples with 2 features each, and 3 labels per class.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 0, 1, 1, 1])
# 5 splits, test_size=0.2, and a fixed random_state so reruns use the same seed.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
sss.get_n_splits(X, y) # 5
print(sss)

# split() yields one (train indices, test indices) pair per split.
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    # The index arrays can be used to slice the data, e.g.:
    # X_train, X_test = X[train_index], X[test_index]
    # y_train, y_test = y[train_index], y[test_index]
    # print(X[train_index], X[test_index])
    # print(y[train_index], y[test_index])

In [None]:
# Colour-coded scatter plot
#
# For a scatter plot, passing the name of a DataFrame column as `c` colours
# each point by that column's value; `cmap` supplies the matplotlib colormap
# used to turn those values into colours.
#
# Here every point is one row: position from longitude/latitude, marker size
# from population / 100, colour from median_house_value via the "jet" colormap.
housing.plot.scatter(
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    sharex=False,
)
plt.legend();

In [None]:
# Row-wise null check: a boolean Series, indexed like the DataFrame, that is
# True for each row holding a null in at least one column.
rows_with_null = housing.isnull().any(axis=1)
rows_with_null

In [None]:
# get_dummies()

# pandas' get_dummies() can one-hot encode as many categorical columns as needed
# and lets you pick a prefix for the generated column names, which keeps the
# encoded columns easy to recognise in later analysis.

import pandas as pd
import numpy as np

# Build the initial single-column DataFrame.
bridge_types = ('Arch','Beam','Truss','Cantilever','Tied Arch','Suspension','Cable')
bridge_df = pd.DataFrame({'Bridge_Types': list(bridge_types)})

# One indicator column per distinct bridge type, named "Type_is_<value>".
dum_df = pd.get_dummies(bridge_df['Bridge_Types'], prefix='Type_is')

# Put the indicator columns next to the original column (rows line up by index).
bridge_df = pd.concat([bridge_df, dum_df], axis=1)
bridge_df

In [None]:
# OneHotEncoder

# The encoder below is fitted directly on the string-valued 'Bridge_Types'
# column; no LabelEncoder step is applied in this cell.
# NOTE(review): older scikit-learn releases may have required integer-coded
# input — confirm the minimum version this notebook targets.

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

bridge_types = ('Arch','Beam','Truss','Cantilever','Tied Arch','Suspension','Cable')
bridge_df = pd.DataFrame(bridge_types, columns=['Bridge_Types'])

enc = OneHotEncoder(handle_unknown='ignore')
# fit_transform() gives back a sparse matrix: it stores only the positions of
# the non-zero entries rather than a full matrix that is zero everywhere except
# for a single one per row. That saves a lot of memory when a categorical
# attribute has thousands of categories.
one_hot_sparse = enc.fit_transform(bridge_df[['Bridge_Types']])
# .toarray() expands it to a dense array so it can be wrapped in a DataFrame.
spr_mat = one_hot_sparse.toarray()
df = bridge_df.join(pd.DataFrame(spr_mat))
df

In [None]:
# OneHotEncoder fed with integer codes from pd.factorize() instead of a LabelEncoder

bridge_types = ('Arch','Beam','Truss','Cantilever','Tied Arch','Suspension','Cable')
bridge_df = pd.DataFrame(bridge_types, columns=['Bridge_Types'])

from sklearn.preprocessing import OneHotEncoder

# factorize() returns an integer code per row plus the distinct values the codes refer to.
cat_enco, dis_cato = pd.factorize(bridge_df['Bridge_Types'])

one_enc = OneHotEncoder()
# Reshape the 1-D codes into a single column before handing them to the encoder.
code_column = cat_enco.reshape(-1, 1)
spmat = one_enc.fit_transform(code_column).toarray()

bridge_df.join(pd.DataFrame(spmat))

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=668a0bf1-0f37-43ae-9b3e-f430ec335e7c' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>