<a href="https://colab.research.google.com/github/codeprogredire/data-science/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook is used for explaining the steps involved in creating a K-Means Clustering model

# Import the required libraries

In [87]:
from sklearn.datasets import make_blobs #for generating datapoints
import numpy as np #for dealing with arrays and performing algebraic operations
import matplotlib.pyplot as plt #for plotting curves
import pandas as pd #for data manipulation
import math #for performing mathematical operations
import zipfile #for extracting zip files

# Downloading the Facebook metrics dataset

In [88]:
import gdown

# Google Drive id of the zipped dataset, and the local file name to save it under
file_id = '1NA5vEJvv3HB9jhMUIaIXGLHq1__hT_-n'
output_file = 'fb-metrics.zip'
download_url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(download_url, output_file)  # last expression, so the cell shows the saved file name

Downloading...
From: https://drive.google.com/uc?id=1NA5vEJvv3HB9jhMUIaIXGLHq1__hT_-n
To: /content/fb-metrics.zip
100%|██████████| 15.4k/15.4k [00:00<00:00, 20.3MB/s]


'fb-metrics.zip'

In [89]:
# Extract the downloaded archive into the current working directory, which is
# where the relative path passed to pd.read_csv in the next step is resolved.
# Reusing output_file (set in the download cell) instead of a hard-coded
# "/content/..." path keeps the notebook runnable outside Colab.
with zipfile.ZipFile(output_file, "r") as zip_ref:
    zip_ref.extractall()

# Read the Facebook metrics dataset

In [90]:
df = pd.read_csv('dataset_Facebook.csv', sep=';')  # fields in this file are separated by ';'

In [91]:
df.head()

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
0,139441,Photo,2,12,4,3,0.0,2752,5091,178,109,159,3078,1640,119,4,79.0,17.0,100
1,139441,Status,2,12,3,10,0.0,10460,19057,1457,1361,1674,11710,6112,1108,5,130.0,29.0,164
2,139441,Photo,3,12,3,3,0.0,2413,4373,177,113,154,2812,1503,132,0,66.0,14.0,80
3,139441,Photo,2,12,2,10,1.0,50128,87991,2211,790,1119,61027,32048,1386,58,1572.0,147.0,1777
4,139441,Photo,2,12,2,3,0.0,7244,13594,671,410,580,6228,3200,396,19,325.0,49.0,393


# Convert the categorical columns to one-hot vectors

Converting the categorical data of Post Month into one-hot vector

In [92]:
# One-hot encode 'Post Month': one Month_<value> indicator column per month value present in the data
months = pd.get_dummies(df['Post Month'], prefix='Month')
# Replace the original column with its indicator columns, which are appended at the end of the dataframe
df = pd.concat([df.drop(columns='Post Month'), months], axis=1)

In [93]:
df.head()

Unnamed: 0,Page total likes,Type,Category,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
0,139441,Photo,2,4,3,0.0,2752,5091,178,109,...,0,0,0,0,0,0,0,0,0,1
1,139441,Status,2,3,10,0.0,10460,19057,1457,1361,...,0,0,0,0,0,0,0,0,0,1
2,139441,Photo,3,3,3,0.0,2413,4373,177,113,...,0,0,0,0,0,0,0,0,0,1
3,139441,Photo,2,2,10,1.0,50128,87991,2211,790,...,0,0,0,0,0,0,0,0,0,1
4,139441,Photo,2,2,3,0.0,7244,13594,671,410,...,0,0,0,0,0,0,0,0,0,1


Converting the categorical Weekday column into one-hot vector

In [94]:
# One-hot encode 'Post Weekday': one Day_<value> indicator column per weekday value present in the data
days = pd.get_dummies(df['Post Weekday'], prefix='Day')
# Replace the original column with its indicator columns, which are appended at the end of the dataframe
df = pd.concat([df.drop(columns='Post Weekday'), days], axis=1)

In [95]:
df.head()

Unnamed: 0,Page total likes,Type,Category,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,...,Month_10,Month_11,Month_12,Day_1,Day_2,Day_3,Day_4,Day_5,Day_6,Day_7
0,139441,Photo,2,3,0.0,2752,5091,178,109,159,...,0,0,1,0,0,0,1,0,0,0
1,139441,Status,2,10,0.0,10460,19057,1457,1361,1674,...,0,0,1,0,0,1,0,0,0,0
2,139441,Photo,3,3,0.0,2413,4373,177,113,154,...,0,0,1,0,0,1,0,0,0,0
3,139441,Photo,2,10,1.0,50128,87991,2211,790,1119,...,0,0,1,0,1,0,0,0,0,0
4,139441,Photo,2,3,0.0,7244,13594,671,410,580,...,0,0,1,0,1,0,0,0,0,0


In [96]:
len(df.columns)

36

# Convert the categorical data of Post Hour into one-hot vectors

In [97]:
# One-hot encode 'Post Hour': one Time_<value> indicator column per hour value present in the data
# (hours that never occur in the dataset get no column)
hours = pd.get_dummies(df['Post Hour'], prefix='Time')
# Replace the original column with its indicator columns, which are appended at the end of the dataframe
df = pd.concat([df.drop(columns='Post Hour'), hours], axis=1)

In [98]:
df.head()

Unnamed: 0,Page total likes,Type,Category,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,...,Time_13,Time_14,Time_15,Time_16,Time_17,Time_18,Time_19,Time_20,Time_22,Time_23
0,139441,Photo,2,0.0,2752,5091,178,109,159,3078,...,0,0,0,0,0,0,0,0,0,0
1,139441,Status,2,0.0,10460,19057,1457,1361,1674,11710,...,0,0,0,0,0,0,0,0,0,0
2,139441,Photo,3,0.0,2413,4373,177,113,154,2812,...,0,0,0,0,0,0,0,0,0,0
3,139441,Photo,2,1.0,50128,87991,2211,790,1119,61027,...,0,0,0,0,0,0,0,0,0,0
4,139441,Photo,2,0.0,7244,13594,671,410,580,6228,...,0,0,0,0,0,0,0,0,0,0


Converting the categorical data of Category into one-hot vectors

In [99]:
# One-hot encode 'Category': one Category_<value> indicator column per category value present in the data
categories = pd.get_dummies(df['Category'], prefix="Category")
# Replace the original column with its indicator columns, which are appended at the end of the dataframe
df = pd.concat([df.drop(columns='Category'), categories], axis=1)

In [100]:
df.head()

Unnamed: 0,Page total likes,Type,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,...,Time_16,Time_17,Time_18,Time_19,Time_20,Time_22,Time_23,Category_1,Category_2,Category_3
0,139441,Photo,0.0,2752,5091,178,109,159,3078,1640,...,0,0,0,0,0,0,0,0,1,0
1,139441,Status,0.0,10460,19057,1457,1361,1674,11710,6112,...,0,0,0,0,0,0,0,0,1,0
2,139441,Photo,0.0,2413,4373,177,113,154,2812,1503,...,0,0,0,0,0,0,0,0,0,1
3,139441,Photo,1.0,50128,87991,2211,790,1119,61027,32048,...,0,0,0,0,0,0,0,0,1,0
4,139441,Photo,0.0,7244,13594,671,410,580,6228,3200,...,0,0,0,0,0,0,0,0,1,0


**Converting the categorical data of Type into one hot vector**

In [101]:
# One-hot encode 'Type': one Type_<value> indicator column per post type present in the data
types = pd.get_dummies(df['Type'], prefix="Type")
# Replace the original column with its indicator columns, which are appended at the end of the dataframe
df = pd.concat([df.drop(columns='Type'), types], axis=1)

In [102]:
df.head()

Unnamed: 0,Page total likes,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,...,Time_20,Time_22,Time_23,Category_1,Category_2,Category_3,Type_Link,Type_Photo,Type_Status,Type_Video
0,139441,0.0,2752,5091,178,109,159,3078,1640,119,...,0,0,0,0,1,0,0,1,0,0
1,139441,0.0,10460,19057,1457,1361,1674,11710,6112,1108,...,0,0,0,0,1,0,0,0,1,0
2,139441,0.0,2413,4373,177,113,154,2812,1503,132,...,0,0,0,0,0,1,0,1,0,0
3,139441,1.0,50128,87991,2211,790,1119,61027,32048,1386,...,0,0,0,0,1,0,0,1,0,0
4,139441,0.0,7244,13594,671,410,580,6228,3200,396,...,0,0,0,0,1,0,0,1,0,0


In [103]:
len(df.columns)

62

In [104]:
list(df.columns)

['Page total likes',
 'Paid',
 'Lifetime Post Total Reach',
 'Lifetime Post Total Impressions',
 'Lifetime Engaged Users',
 'Lifetime Post Consumers',
 'Lifetime Post Consumptions',
 'Lifetime Post Impressions by people who have liked your Page',
 'Lifetime Post reach by people who like your Page',
 'Lifetime People who have liked your Page and engaged with your post',
 'comment',
 'like',
 'share',
 'Total Interactions',
 'Month_1',
 'Month_2',
 'Month_3',
 'Month_4',
 'Month_5',
 'Month_6',
 'Month_7',
 'Month_8',
 'Month_9',
 'Month_10',
 'Month_11',
 'Month_12',
 'Day_1',
 'Day_2',
 'Day_3',
 'Day_4',
 'Day_5',
 'Day_6',
 'Day_7',
 'Time_1',
 'Time_2',
 'Time_3',
 'Time_4',
 'Time_5',
 'Time_6',
 'Time_7',
 'Time_8',
 'Time_9',
 'Time_10',
 'Time_11',
 'Time_12',
 'Time_13',
 'Time_14',
 'Time_15',
 'Time_16',
 'Time_17',
 'Time_18',
 'Time_19',
 'Time_20',
 'Time_22',
 'Time_23',
 'Category_1',
 'Category_2',
 'Category_3',
 'Type_Link',
 'Type_Photo',
 'Type_Status',
 'Type_Video']

# Removing the NULL values

**Function to find columns having NULL values**

In [105]:
def check_for_columns_with_null_values(data_frame):
  """Return the names of the columns of `data_frame` that contain at least one null value.

  The names come back as a list, in the same order as `data_frame.columns`;
  the list is empty when no column contains a null.
  """
  return [
      column_name
      for column_name in data_frame.columns
      if data_frame[column_name].isnull().values.any()  # True as soon as one cell of the column is null
  ]

In [106]:
columns_with_null_values = check_for_columns_with_null_values(df)   # names of the columns of df that contain at least one null

In [107]:
columns_with_null_values

['Paid', 'like', 'share']

**Replacing the NULL values with 0 in Paid, like and share columns**

In [108]:
# Replace the null values with 0 in each of the three columns found to contain nulls
for null_column in ('Paid', 'like', 'share'):
    df[null_column] = df[null_column].fillna(0)

In [109]:
df['Paid'].isnull().sum()                       # Number of rows in 'Paid' that are still null after the fill

0

# Standardizing the data

In [110]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()                      # Standardizes features (zero mean, unit variance with the default settings); re-fit on each column below

**Standardizing the 'Page total likes' column using StandardScaler()**

In [111]:
page_likes = df['Page total likes'].to_numpy().reshape(-1, 1)    # one-column 2-D array, as passed to the scaler
df['Page total likes'] = scaler.fit_transform(page_likes)        # Standardizing the 'Page total likes' column

In [112]:
df['Page total likes']

0      0.999403
1      0.999403
2      0.999403
3      0.999403
4      0.999403
         ...   
495   -2.343746
496   -2.572761
497   -2.572761
498   -2.572761
499   -2.572761
Name: Page total likes, Length: 500, dtype: float64

**Standardizing other columns**

In [113]:
# Standardize the remaining numeric columns exactly as was done for
# 'Page total likes': re-fit the scaler on one column at a time and overwrite
# that column with the result. A column list plus a loop replaces thirteen
# copy-pasted statements, so adding or removing a column is a one-line change.
columns_to_standardize = [
    'Paid',
    'Lifetime Post Total Reach',
    'Lifetime Post Total Impressions',
    'Lifetime Engaged Users',
    'Lifetime Post Consumers',
    'Lifetime Post Consumptions',
    'Lifetime Post Impressions by people who have liked your Page',
    'Lifetime Post reach by people who like your Page',
    'Lifetime People who have liked your Page and engaged with your post',
    'comment',
    'like',
    'share',
    'Total Interactions',
]
for column_name in columns_to_standardize:
    column_values = np.array(df[column_name]).reshape(-1, 1)   # one-column 2-D array for the scaler
    df[column_name] = scaler.fit_transform(column_values)

**Displaying the final preprocessed dataframe**

In [114]:
df.head()

Unnamed: 0,Page total likes,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,...,Time_20,Time_22,Time_23,Category_1,Category_2,Category_3,Type_Link,Type_Photo,Type_Status,Type_Video
0,0.999403,-0.620517,-0.490859,-0.319251,-0.754391,-0.78239,-0.628507,-0.229166,-0.64442,-0.802117,...,0,0,0,0,1,0,0,1,0,0
1,0.999403,-0.620517,-0.151569,-0.137227,0.545365,0.63772,0.129526,-0.084652,-0.061698,0.813599,...,0,0,0,0,1,0,0,0,1,0
2,0.999403,-0.620517,-0.505781,-0.328608,-0.755407,-0.777853,-0.631009,-0.23362,-0.662272,-0.780879,...,0,0,0,0,0,1,0,1,0,0
3,0.999403,1.611559,1.594532,0.761212,1.311601,-0.00995,-0.148169,0.740997,3.317884,1.267764,...,0,0,0,0,1,0,0,1,0,0
4,0.999403,-0.620517,-0.293131,-0.208428,-0.25339,-0.440974,-0.417859,-0.17643,-0.441145,-0.349586,...,0,0,0,0,1,0,0,1,0,0


# Applying K-Means clustering

In [115]:
from sklearn.cluster import KMeans

In [124]:
kmeans = KMeans(n_clusters=8,n_init='auto',random_state=42)  # 8 clusters with a fixed random_state; change n_clusters as per your requirement

In [125]:
kmeans.fit(df)  # df is the preprocessed data: every column (standardized and one-hot) is used as a feature

In [126]:
n=len(df)  # number of rows in the preprocessed dataframe
n

500

In [127]:
import random
# Pick a uniformly random row position in [0, len(df)). Deriving the bound
# from the dataframe replaces the hard-coded 500, which would silently go
# stale (or index out of range) if the number of rows ever changed.
random_index = random.randrange(len(df))
random_index

492

In [128]:
# Take the randomly chosen row and reshape it to a single-row 2-D array (1 x number of columns)
selected_row = df.iloc[random_index, :]
random_sample = selected_row.to_numpy().reshape(1, -1)
random_sample

array([[-2.28924443, -0.62051716,  1.40930494, -0.30990567, -0.17005966,
        -0.16307668, -0.32629463, -0.01674796,  0.53874612, -0.63711482,
        -0.35359639, -0.15360113, -0.42496671, -0.19776087,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ]])

In [129]:
prediction=kmeans.predict(random_sample)  # cluster label the fitted model assigns to the sampled row
prediction



array([4], dtype=int32)