In [None]:
#importing all libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import joblib

# Load the source table; later cells read its 'bought_in_previous_month'
# and 'rating' columns.
df = pd.read_csv('filteredData.csv')


In [None]:
def transform_row(row):
    """Expand one row into copies with consecutive purchase counts.

    Starting at the row's 'bought_in_previous_month' value, one copy of the
    row is produced for every integer up to (but not including) the next
    bucket boundary. The bucket size depends on the magnitude of the value:

    * value < 100   -> next multiple of 10
    * value < 1000  -> next multiple of 100
    * otherwise     -> next multiple of 1000

    Parameters
    ----------
    row : mapping-like (e.g. ``pandas.Series``)
        Must support ``row['bought_in_previous_month']`` and ``row.copy()``.
        The input row itself is never modified.

    Returns
    -------
    list
        Copies of ``row`` whose 'bought_in_previous_month' runs from the
        original value to the last integer before the bucket boundary.

    Raises
    ------
    ValueError
        If the count cannot be converted to an integer (e.g. NaN).
    """
    # iterrows() can hand the count over as a float (all-numeric frames are
    # upcast), and range() rejects floats, so coerce to int first.
    value = int(row['bought_in_previous_month'])

    if value < 100:
        bucket = 10
    elif value < 1000:
        bucket = 100
    else:
        bucket = 1000

    # First multiple of `bucket` strictly greater than `value`.
    upper = (value // bucket + 1) * bucket

    rows = []
    for count in range(value, upper):
        new_row = row.copy()
        new_row['bought_in_previous_month'] = count
        rows.append(new_row)
    return rows

In [5]:
# Expand every source row with transform_row, attach the 0/1 'trending'
# label, and write the result to modified_file.csv.
new_rows = []
for _, row in df.iterrows():
    new_rows += transform_row(row)

new_df = pd.DataFrame(new_rows)

# A row is labelled trending when it has at least 500 purchases AND a rating
# of at least 4.
new_df['trending'] = (
    new_df['bought_in_previous_month'].ge(500) & new_df['rating'].ge(4)
).astype(int)

new_df.to_csv('modified_file.csv', index=False)


In [6]:

# Summary statistics for the augmented, labelled frame.
new_df.describe()

Unnamed: 0,rating,Unnamed: 5,bought_in_previous_month,trending
count,784260.0,0.0,784260.0,784260.0
mean,4.445972,,3651.646355,0.908627
std,0.223819,,4372.961132,0.288139
min,2.7,,50.0,0.0
25%,4.3,,1556.0,1.0
50%,4.5,,2891.0,1.0
75%,4.6,,3988.0,1.0
max,5.0,,70999.0,1.0


In [None]:
 # Features and target
# Target: the derived 0/1 'trending' label.
y = new_df['trending']
# Inputs: the two numeric columns the label was derived from.
X = new_df[['rating', 'bought_in_previous_month']]

In [8]:
# Standardise the feature matrix.
# NOTE(review): the split cell passes X (not X_scaled) to train_test_split and
# the prediction cells feed raw values to the model, so X_scaled is not used
# by any cell visible here - confirm whether scaling is still wanted.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print('split done')

split done


In [10]:
# Fit a 100-tree random forest on the training split (seeded for
# reproducibility); fit() returns the estimator itself.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
print(model)

RandomForestClassifier(random_state=42)


In [None]:
# Evaluate the fitted model on the held-out test split.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     14525
           1       1.00      1.00      1.00    142327

    accuracy                           1.00    156852
   macro avg       1.00      1.00      1.00    156852
weighted avg       1.00      1.00      1.00    156852



In [12]:
# Spot-check the in-memory model on two hand-written rows
# (one per row: rating, purchases in the previous month).
new_data = pd.DataFrame(
    [(4, 500), (1, 50)],
    columns=['rating', 'bought_in_previous_month'],
)
predictions = model.predict(new_data)
print(predictions)

[1 0]


In [13]:
# Persist the fitted model to model.pkl.
joblib.dump(value=model, filename='model.pkl')
print('model saved')

model saved


In [2]:
# Restore the persisted model from model.pkl.
# CAUTION: joblib files are pickle-based; only load files from a trusted source.
model = joblib.load(filename='model.pkl')

In [3]:
# Spot-check the reloaded model on the same two hand-written rows used
# before the model was saved.
input_data = pd.DataFrame([
    {'rating': 4, 'bought_in_previous_month': 500},
    {'rating': 1, 'bought_in_previous_month': 50},
])
predictions = model.predict(input_data)
print(predictions)

[1 0]
