In [2]:
!pip install pymysql sqlalchemy



In [4]:
import pymysql
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

import getpass  # to get the password without showing the input
password = getpass.getpass()

connection_string = 'mysql+pymysql://root:' + password + '@localhost/sakila'
engine = create_engine(connection_string)


········


In [11]:
film_data_query = """
select
 f.film_id,
 f.title,
 f.release_year,
 f.rental_duration,
 f.rental_rate,
 f.length,
 f.rating,
 f.special_features,
 c.name as category_name,
 count(r.rental_id) as total_rentals
from
 film f
join 
 film_category fc on f.film_id = fc.film_id
join
 category c on fc.category_id = c.category_id
left join
 inventory i  on f.film_id = i.film_id
left join
 rental r on i.inventory_id = r.inventory_id
group by
 f.film_id, f.title, f.release_year, f.rental_duration, f.rental_rate, f.length, f.rating, f.special_features, c.name;
 """
film_data = pd.read_sql(film_data_query,engine)
film_data


Unnamed: 0,film_id,title,release_year,rental_duration,rental_rate,length,rating,special_features,category_name,total_rentals
0,19,AMADEUS HOLY,2006,6,0.99,113,PG,"Commentaries,Deleted Scenes,Behind the Scenes",Action,21
1,21,AMERICAN CIRCUS,2006,3,4.99,129,R,"Commentaries,Behind the Scenes",Action,22
2,29,ANTITRUST TOMATOES,2006,5,2.99,168,NC-17,"Trailers,Commentaries,Deleted Scenes",Action,10
3,38,ARK RIDGEMONT,2006,6,0.99,68,NC-17,"Trailers,Commentaries,Deleted Scenes,Behind th...",Action,0
4,56,BAREFOOT MANCHURIAN,2006,6,2.99,129,G,"Trailers,Commentaries",Action,18
...,...,...,...,...,...,...,...,...,...,...
995,931,VALENTINE VANISHING,2006,7,0.99,48,PG-13,"Trailers,Behind the Scenes",Travel,12
996,977,WINDOW SIDE,2006,3,2.99,85,R,"Deleted Scenes,Behind the Scenes",Travel,12
997,981,WOLVES DESIRE,2006,7,0.99,55,NC-17,Behind the Scenes,Travel,21
998,988,WORKER TARZAN,2006,7,2.99,139,R,"Trailers,Commentaries,Behind the Scenes",Travel,15


In [10]:
rented_last_month_query = """
select
   f.film_id,
   case
       when count(r.rental_id) > 0 then 1
       else 0 
   end as rented_last_month
from
    film f
left join 
    inventory i on f.film_id = i.film_id
left join 
    rental r on i.inventory_id = r.inventory_id
where 
   r.rental_date between '2005-08-01' and '2005-08-31'
group by
    f.film_id;
"""
target_data = pd.read_sql(rented_last_month_query, engine)
target_data

Unnamed: 0,film_id,rented_last_month
0,683,1
1,894,1
2,941,1
3,611,1
4,281,1
...,...,...
953,718,1
954,157,1
955,32,1
956,393,1


In [13]:
data = pd.merge(film_data, target_data, on='film_id',how='left').fillna(0)
data



Unnamed: 0,film_id,title,release_year,rental_duration,rental_rate,length,rating,special_features,category_name,total_rentals,rented_last_month
0,19,AMADEUS HOLY,2006,6,0.99,113,PG,"Commentaries,Deleted Scenes,Behind the Scenes",Action,21,1.0
1,21,AMERICAN CIRCUS,2006,3,4.99,129,R,"Commentaries,Behind the Scenes",Action,22,1.0
2,29,ANTITRUST TOMATOES,2006,5,2.99,168,NC-17,"Trailers,Commentaries,Deleted Scenes",Action,10,1.0
3,38,ARK RIDGEMONT,2006,6,0.99,68,NC-17,"Trailers,Commentaries,Deleted Scenes,Behind th...",Action,0,0.0
4,56,BAREFOOT MANCHURIAN,2006,6,2.99,129,G,"Trailers,Commentaries",Action,18,1.0
...,...,...,...,...,...,...,...,...,...,...,...
995,931,VALENTINE VANISHING,2006,7,0.99,48,PG-13,"Trailers,Behind the Scenes",Travel,12,1.0
996,977,WINDOW SIDE,2006,3,2.99,85,R,"Deleted Scenes,Behind the Scenes",Travel,12,1.0
997,981,WOLVES DESIRE,2006,7,0.99,55,NC-17,Behind the Scenes,Travel,21,1.0
998,988,WORKER TARZAN,2006,7,2.99,139,R,"Trailers,Commentaries,Behind the Scenes",Travel,15,1.0


In [14]:
data = pd.get_dummies(data, columns = ['category_name', 'rating'], drop_first = True)

scaler = StandardScaler()
data[['rental_rate' , 'length' , 'rental_duration']] = scaler.fit_transform(data[['rental_rate','length','rental_duration']])
data.isnull().sum()


film_id                      0
title                        0
release_year                 0
rental_duration              0
rental_rate                  0
length                       0
special_features             0
total_rentals                0
rented_last_month            0
category_name_Animation      0
category_name_Children       0
category_name_Classics       0
category_name_Comedy         0
category_name_Documentary    0
category_name_Drama          0
category_name_Family         0
category_name_Foreign        0
category_name_Games          0
category_name_Horror         0
category_name_Music          0
category_name_New            0
category_name_Sci-Fi         0
category_name_Sports         0
category_name_Travel         0
rating_NC-17                 0
rating_PG                    0
rating_PG-13                 0
rating_R                     0
dtype: int64

In [15]:
data_cleaned = data.drop(['title','special_features'],axis=1)
x = data_cleaned.drop('rented_last_month',axis=1)
y = data_cleaned['rented_last_month']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)


# Evaluate the Results

#Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}%")

# Confution matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confution matrix")
print(conf_matrix)

#Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))




Accuracy: 1.00%
Confution matrix
[[ 11   0]
 [  0 189]]
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        11
         1.0       1.00      1.00      1.00       189

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [16]:
print(data.shape)
print(data.head())

(1000, 28)
   film_id                title  release_year  rental_duration  rental_rate  \
0       19         AMADEUS HOLY          2006         0.719374    -1.209308   
1       21      AMERICAN CIRCUS          2006        -1.406855     1.221461   
2       29   ANTITRUST TOMATOES          2006         0.010631     0.006077   
3       38        ARK RIDGEMONT          2006         0.719374    -1.209308   
4       56  BAREFOOT MANCHURIAN          2006         0.719374     0.006077   

     length                                   special_features  total_rentals  \
0 -0.056229      Commentaries,Deleted Scenes,Behind the Scenes             21   
1  0.339751                     Commentaries,Behind the Scenes             22   
2  1.304951               Trailers,Commentaries,Deleted Scenes             10   
3 -1.169922  Trailers,Commentaries,Deleted Scenes,Behind th...              0   
4  0.339751                              Trailers,Commentaries             18   

   rented_last_month  categ

In [17]:
print(data['rented_last_month'].value_counts())

rented_last_month
1.0    958
0.0     42
Name: count, dtype: int64
