In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC

# Load the pre-COVID and post-COVID review datasets from the working directory.
pre_covid_data = pd.read_csv('pre_covid_data(1).csv')
post_covid_data = pd.read_csv('covid_data(1).csv')

# Stack both periods into one frame for a unified analysis.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# default row labels of the two files (each starting at 0) overlap, so
# label-based access such as data.loc[i] would return more than one row.
data = pd.concat([pre_covid_data, post_covid_data], ignore_index=True)

# Preview the first few rows of the combined data
print(data.head())



                        App Name        Category  Rating  Reviews        Size  \
0  Colorfit - Drawing & Coloring  ART_AND_DESIGN     4.7    20260  25000000.0   
1  Colorfit - Drawing & Coloring  ART_AND_DESIGN     4.7    20260  25000000.0   
2  Colorfit - Drawing & Coloring  ART_AND_DESIGN     4.7    20260  25000000.0   
3  Colorfit - Drawing & Coloring  ART_AND_DESIGN     4.7    20260  25000000.0   
4  Colorfit - Drawing & Coloring  ART_AND_DESIGN     4.7    20260  25000000.0   

   Installs  Type  Price Content Rating                   Genres  ...  \
0    500000  Free    0.0       Everyone  Art & Design;Creativity  ...   
1    500000  Free    0.0       Everyone  Art & Design;Creativity  ...   
2    500000  Free    0.0       Everyone  Art & Design;Creativity  ...   
3    500000  Free    0.0       Everyone  Art & Design;Creativity  ...   
4    500000  Free    0.0       Everyone  Art & Design;Creativity  ...   

    Android Ver                                  Translated_Review Sentime

In [2]:
# Per-column count of missing values in the combined frame
print(data.isna().sum())

App Name                       0
Category                       0
Rating                         0
Reviews                        0
Size                      112233
Installs                       0
Type                           0
Price                          0
Content Rating                 0
Genres                         0
Last Updated                   0
Current Ver                    0
Android Ver                    0
Translated_Review           7157
Sentiment                   7157
Sentiment_Polarity          7157
Sentiment_Subjectivity      7157
Review Date                    0
content                        0
sentiment                      0
polarity                       0
subjectivity                   0
dtype: int64


In [3]:
# Show the column labels of the combined frame
print(data.columns)

Index(['App Name', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver', 'Translated_Review', 'Sentiment', 'Sentiment_Polarity',
       'Sentiment_Subjectivity', 'Review Date', 'content', 'sentiment',
       'polarity', 'subjectivity'],
      dtype='object')


## Data Handling

In [4]:
# Column the models are trained to predict
target_column = 'Rating'

# Feature matrix: every column of the combined frame except the target
X = data.drop(target_column, axis='columns')

# Map each distinct target value to an integer class id.
# The fitted encoder is kept so predictions can be mapped back via
# label_encoder.inverse_transform if needed.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data[target_column])

In [5]:
# Sanity check: X and y should report the same number of rows
for _label, _array in (("Shape of X:", X), ("Shape of y:", y)):
    print(_label, _array.shape)

Shape of X: (118767, 21)
Shape of y: (118767,)


In [6]:
# Partition the feature columns by dtype so each group gets its own preprocessing.
# 'number' matches every numeric dtype (int32, float32, ... as well as
# int64/float64). A column that ends up in neither list is not handed to any
# transformer and is therefore dropped by ColumnTransformer's default
# remainder='drop', so the selection should not be narrower than necessary.
numerical_features = X.select_dtypes(include='number').columns

# Text columns are stored as object dtype; 'category' is included so that
# columns converted to pandas categoricals are still treated as categorical.
categorical_features = X.select_dtypes(include=['object', 'category']).columns

## Preprocessing

In [7]:
# Numerical columns: fill missing values with the column mean, then
# standardise to zero mean / unit variance (SVMs are sensitive to feature scale).
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' makes categories that were not seen
# during fit encode as all zeros instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer; the outputs are concatenated
# into a single feature matrix for the classifier.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# reproducible across runs.
# NOTE(review): rows are assigned at random, and the head() preview shows the
# same app repeated across consecutive rows, so rows of one app can fall into
# both partitions -- confirm whether a group-wise split by app is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
for _label, _frame in (("Training data shape:", X_train), ("Test data shape:", X_test)):
    print(_label, _frame.shape)

Training data shape: (95013, 21)
Test data shape: (23754, 21)


## SVM Models

### Linear Kernel SVM

In [13]:
# Pipeline for Linear Kernel SVM.
# clone() gives this pipeline its own unfitted copy of the preprocessor; passing
# the shared `preprocessor` object directly would make every pipeline fit (and
# later transform with) the very same instance.
pipeline_linear = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                  ('classifier', SVC(kernel='linear'))])

# X_train / y_train come from the train/test split cell above. The split is
# deliberately not repeated here, so all three kernels are trained and
# evaluated on one and the same partition.

# Train the Linear Kernel SVM model
print("Training Linear Kernel SVM...")
pipeline_linear.fit(X_train, y_train)

Training Linear Kernel SVM...


### Polynomial Kernel SVM

In [10]:
# Pipeline for Polynomial Kernel SVM (cubic kernel).
# clone() gives this pipeline its own unfitted copy of the preprocessor; passing
# the shared `preprocessor` object directly would make every pipeline fit (and
# later transform with) the very same instance.
pipeline_poly = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                ('classifier', SVC(kernel='poly', degree=3))])

# Train the Polynomial Kernel SVM model
print("Training Polynomial Kernel SVM...")
pipeline_poly.fit(X_train, y_train)


Training Polynomial Kernel SVM...


### RBF Kernel SVM

In [11]:
# Pipeline for RBF Kernel SVM.
# clone() gives this pipeline its own unfitted copy of the preprocessor; passing
# the shared `preprocessor` object directly would make every pipeline fit (and
# later transform with) the very same instance.
pipeline_rbf = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                               ('classifier', SVC(kernel='rbf'))])

# Train the RBF Kernel SVM model
print("Training RBF Kernel SVM...")
pipeline_rbf.fit(X_train, y_train)

Training RBF Kernel SVM...


### Model Evaluation

In [14]:
# Predict the held-out rows with each fitted pipeline
y_pred_linear = pipeline_linear.predict(X_test)
y_pred_poly = pipeline_poly.predict(X_test)
y_pred_rbf = pipeline_rbf.predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table for every kernel.
# Class ids in the report are the LabelEncoder codes, not the raw target values.
# NOTE(review): the recorded output shows 1.00 on every metric for the kernels
# visible here; that usually points to target leakage or near-duplicate rows
# shared between train and test rather than a genuinely perfect model --
# verify before relying on these numbers.
_reports = (
    ("Linear Kernel SVM Results:", y_pred_linear),
    ("Polynomial Kernel SVM Results:", y_pred_poly),
    ("RBF Kernel SVM Results:", y_pred_rbf),
)
for _title, _predictions in _reports:
    print(_title)
    print("Accuracy:", accuracy_score(y_test, _predictions))
    print(classification_report(y_test, _predictions))

Linear Kernel SVM Results:
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00      3441
           2       1.00      1.00      1.00       128
           3       1.00      1.00      1.00       275
           4       1.00      1.00      1.00      8842
           5       1.00      1.00      1.00       853
           6       1.00      1.00      1.00        48
           7       1.00      1.00      1.00      9924
           8       1.00      1.00      1.00       227
           9       1.00      1.00      1.00         7

    accuracy                           1.00     23754
   macro avg       1.00      1.00      1.00     23754
weighted avg       1.00      1.00      1.00     23754

Polynomial Kernel SVM Results:
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00   