In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge, Lasso

<h2> Feature Selection on OnlineNewsPopularity </h2>

<h3>Exercise</h3>


1. Load the "OnlineNewsPopularity.csv" dataset 
2. Drop the Column which isn't required


In [2]:
 # read the csv file from the link provided
 # drop the column that is not required from the dataset(url)

online_popularity_data = pd.read_csv('OnlineNewsPopularity.csv')
# drop the 'url' column

online_popularity_data.drop('url', axis=1, inplace=True)

In [3]:
# strip the whitespace in the column names
online_popularity_data.columns = online_popularity_data.columns.str.strip()

<h3>Exercise</h3>


1. Scale the data using a appropriate scaler and re-asign the column names after scaling.
2. The function below should return scaled result in the form of DataFrame

If you are unfamiliar with preprocessing, you use the links below as a reference before you start working on the exercise.

### https://scikit-learn.org/stable/modules/preprocessing.html
### https://towardsdatascience.com/preprocessing-with-sklearn-a-complete-and-comprehensive-guide-670cb98fcfb9
### https://machinelearningmastery.com/standardscaler-and-minmaxscaler-transforms-in-python/

In [4]:
# hint: Use MinMaxScaler for scaling
def scale_data(data):
    # store all the columns
    
    # create a scaler
    scaler = MinMaxScaler(feature_range=(0, 1))
    
    # fit and transform the data
    transformed_data = pd.DataFrame(scaler.fit_transform(online_popularity_data), 
                                    columns=online_popularity_data.columns)
    
    # store the transformed data in a dataframe and return it.
    
    return transformed_data


In [5]:
transformed_data = scale_data(online_popularity_data)
transformed_data.head()

Unnamed: 0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,1.0,0.47619,0.025844,0.000947,0.00096,0.001254,0.013158,0.017241,0.007812,0.0,...,0.1,0.7,0.65,0.4,0.8,0.5,0.40625,0.0,0.1875,0.000702
1,1.0,0.333333,0.030092,0.000863,0.00096,0.001218,0.009868,0.008621,0.007812,0.0,...,0.033333,0.7,0.88125,0.875,0.9,0.0,0.5,1.0,0.0,0.000842
2,1.0,0.333333,0.0249,0.00082,0.00096,0.001021,0.009868,0.008621,0.007812,0.0,...,0.1,1.0,0.533333,0.2,0.866667,0.0,0.5,1.0,0.0,0.001778
3,1.0,0.333333,0.062662,0.000719,0.00096,0.001024,0.029605,0.0,0.007812,0.0,...,0.136364,0.8,0.630303,0.4,0.833333,0.0,0.5,1.0,0.0,0.001422
4,1.0,0.52381,0.126505,0.000593,0.00096,0.000832,0.0625,0.163793,0.15625,0.0,...,0.033333,1.0,0.779808,0.5,0.95,0.454545,0.568182,0.090909,0.136364,0.000598


5. Perform train_test_split

In [6]:
# select all rows of all columns except the column 'shares'
X = transformed_data.loc[:, :'abs_title_sentiment_polarity']

# the column we are going to classify
y = transformed_data['shares']

# do the train test data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=50)

# convert y_train and y_test into dataframes
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

y_train = y_train.astype('int')

6. Write a function which returns the list of k-Best features where k being the number of features required

You can use links below as a reference.

### https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest

### https://www.datatechnotes.com/2021/02/seleckbest-feature-selection-example-in-python.html

In [7]:
#use chi2
def get_k_best_features(X_train, y_train, k):
    
    # use SelectKBest class to get k best features
    X_best_features = SelectKBest(score_func=chi2, k=k).fit(X_train, y_train)
    
    # will return boolean indices
    best_features_indices = X_best_features.get_support()
    
    best_features = []
    
    data_columns = X_train.columns
    
    for index, bool_value in enumerate(best_features_indices):
        # append the best features to the best_features list
        best_features.append(data_columns[index])
        
    return best_features



Print the results

In [8]:
features_one = get_k_best_features(X_train, y_train, 6)        
features_two = get_k_best_features(X_train, y_train, 12)

In [9]:
features_one

['timedelta',
 'n_tokens_title',
 'n_tokens_content',
 'n_unique_tokens',
 'n_non_stop_words',
 'n_non_stop_unique_tokens',
 'num_hrefs',
 'num_self_hrefs',
 'num_imgs',
 'num_videos',
 'average_token_length',
 'num_keywords',
 'data_channel_is_lifestyle',
 'data_channel_is_entertainment',
 'data_channel_is_bus',
 'data_channel_is_socmed',
 'data_channel_is_tech',
 'data_channel_is_world',
 'kw_min_min',
 'kw_max_min',
 'kw_avg_min',
 'kw_min_max',
 'kw_max_max',
 'kw_avg_max',
 'kw_min_avg',
 'kw_max_avg',
 'kw_avg_avg',
 'self_reference_min_shares',
 'self_reference_max_shares',
 'self_reference_avg_sharess',
 'weekday_is_monday',
 'weekday_is_tuesday',
 'weekday_is_wednesday',
 'weekday_is_thursday',
 'weekday_is_friday',
 'weekday_is_saturday',
 'weekday_is_sunday',
 'is_weekend',
 'LDA_00',
 'LDA_01',
 'LDA_02',
 'LDA_03',
 'LDA_04',
 'global_subjectivity',
 'global_sentiment_polarity',
 'global_rate_positive_words',
 'global_rate_negative_words',
 'rate_positive_words',
 'rate_ne

In [10]:
features_two

['timedelta',
 'n_tokens_title',
 'n_tokens_content',
 'n_unique_tokens',
 'n_non_stop_words',
 'n_non_stop_unique_tokens',
 'num_hrefs',
 'num_self_hrefs',
 'num_imgs',
 'num_videos',
 'average_token_length',
 'num_keywords',
 'data_channel_is_lifestyle',
 'data_channel_is_entertainment',
 'data_channel_is_bus',
 'data_channel_is_socmed',
 'data_channel_is_tech',
 'data_channel_is_world',
 'kw_min_min',
 'kw_max_min',
 'kw_avg_min',
 'kw_min_max',
 'kw_max_max',
 'kw_avg_max',
 'kw_min_avg',
 'kw_max_avg',
 'kw_avg_avg',
 'self_reference_min_shares',
 'self_reference_max_shares',
 'self_reference_avg_sharess',
 'weekday_is_monday',
 'weekday_is_tuesday',
 'weekday_is_wednesday',
 'weekday_is_thursday',
 'weekday_is_friday',
 'weekday_is_saturday',
 'weekday_is_sunday',
 'is_weekend',
 'LDA_00',
 'LDA_01',
 'LDA_02',
 'LDA_03',
 'LDA_04',
 'global_subjectivity',
 'global_sentiment_polarity',
 'global_rate_positive_words',
 'global_rate_negative_words',
 'rate_positive_words',
 'rate_ne

<h3>=> Describe about feature selection and expalin your code in detail?</h3>


<h2> Model selection on Algerian_forest_fires_dataset_UPDATE-1 dataset  </h2>
<h3>Exercise (Hint use Ridge and Lasso to compare the models.)</h3>

<p>Your task is to findout which of the above models is best suited for the given dataset and give reasons in this scenario. </p>
<p>Also, you need to give scenarios which each of these Models work better over the other.</p>


1. Load the dataset

In [49]:
# load the dataset and use skiprows argument to skip the first row
algerian_forest_data = pd.read_csv('Algerian_forest_fires_dataset_UPDATE-1.csv', skiprows = 1 )
algerian_forest_data.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


In [50]:
# convert the column names to lowercase and remove the white spaces in the column name
# this step is optional
algerian_forest_data.columns = [column.strip() for column in algerian_forest_data.columns.str.lower()]

In [51]:
data_frame = algerian_forest_data.loc[124:] 

data_frame = pd.concat([algerian_forest_data.loc[:120], algerian_forest_data.loc[125:]])

# remove the white spaces in the 'classes' column
data_frame['classes'] = data_frame['classes'].str.strip()

### https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

### https://www.geeksforgeeks.org/ml-label-encoding-of-datasets-in-python/

### https://stackoverflow.com/questions/66056695/what-does-labelencoder-fit-do

In [52]:
# create an instance of LabelEncoder
label_encoder = LabelEncoder()

label_encoder.fit(data_frame['classes'])

data_frame['classes'] = label_encoder.transform(data_frame['classes'])
data_frame = data_frame.drop(index=[167])

data_frame

Unnamed: 0,day,month,year,temperature,rh,ws,rain,ffmc,dmc,dc,isi,bui,fwi,classes
0,01,06,2012,29,57,18,0,65.7,3.4,7.6,1.3,3.4,0.5,1
1,02,06,2012,29,61,13,1.3,64.4,4.1,7.6,1,3.9,0.4,1
2,03,06,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,1
3,04,06,2012,25,89,13,2.5,28.6,1.3,6.9,0,1.7,0,1
4,05,06,2012,27,77,16,0,64.8,3,14.2,1.2,3.9,0.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,26,09,2012,30,65,14,0,85.4,16,44.5,4.5,16.9,6.5,0
242,27,09,2012,28,87,15,4.4,41.1,6.5,8,0.1,6.2,0,1
243,28,09,2012,27,87,29,0.5,45.9,3.5,7.9,0.4,3.4,0.2,1
244,29,09,2012,24,54,18,0.1,79.7,4.3,15.2,1.7,5.1,0.7,1


* Drop the unnecessary columns and use train_test_split

In [23]:
# drop the columns ['classes', 'day', 'month', 'year'] and store the data frame in X
X = data_frame.drop(columns=['classes', 'day', 'month', 'year'])

# store the column we are going to classify , i.e., 'classes'
y = data_frame['classes']

# fit and transform the data using StandardScaler
scalar = StandardScaler()
X = scalar.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### perform Ridge regularization 
### https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

In [19]:
# create an instance of the Ridge class with alpha=1.0
ridge = Ridge(alpha=1.0)

# train the Ridge model on the training data
ridge.fit(X_train, y_train)

# make predictions on the testing data
y_pred = ridge.predict(X_test)

### perform Lasso regularization 
### https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso

In [20]:
# create an instance of the Lasso class with alpha=1.0
lasso = Lasso(alpha=1.0)

# train the Lasso model on the training data
lasso.fit(X_train, y_train)

# make predictions on the testing data
y_pred = lasso.predict(X_test)