In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [None]:
#Importing the training dataset
train_set = pd.read_csv('train.csv')

#Importing the test dataset
test_set = pd.read_csv('test.csv')

In [None]:
train_set

In [None]:
test_set.head(5)

In [None]:
#Checking that the column names and positions of the two datasets match each other.
#The last training column is left out of the comparison
#(presumably the target, which the test set does not contain - TODO confirm)
train_set.columns[:-1] == test_set.columns

### Assessing the Quality of the Data

In [None]:
train_set.columns

In [None]:
train_set.dtypes

### Description of the Features
- title - Title of the movie
- country - Countries in which movie was released
- genres - Movie genres (Action, Adventure, Comedy, etc.)
- language - Languages in which movie was released
- writer_count - Number of writers of the movie
- title_adaption - Is movie original screenplay or adapted
- censor_rating - Release rating given to the movie (R /PG-13/PG/NR/UR/G) 
- release_date - Date when movie was released
- runtime - Movie runtime
- dvd_release_date - Date of release of DVD for sale
- users_votes - Number of users who voted for this movie to be included in Watch-It library
- comments - Number of comments on movie trailer
- likes - Number of likes on movie trailer
- overall_views - Number of views on movie trailer
- dislikes - Number of dislikes on movie trailer
- ratings_imdb - Rating given to movie on IMDB
- ratings_tomatoes - Rating given to movie on Rotten tomatoes.
- ratings_metacritic - Rating given to movie on Metacritic etc.
- special_award - Number of awards nominations/winnings in BAFTA, Oscar or Golden Globe.
- awards_win - Awards won by the movie
- awards_nomination - Number of awards nominations
- revenue_category - Revenue Category (High/Low)























In [None]:
#Counting the null values in each column
train_set.isna().sum()

In [None]:
#Visualizing the proportion of null values present in each column.
#displot is a figure-level function that creates its own figure, so the size is set
#through height/aspect here; a preceding plt.figure(figsize=...) call would only leave
#an empty extra figure behind without resizing this plot.
sns.displot(
    data=train_set.isna().melt(value_name="missing"),
    y="variable",
    hue="missing",
    multiple="fill",
    height=6,
    aspect=10 / 6,
);

In [None]:
#Checking the number of distinct values in each column
train_set.nunique()

#Keep in mind that there are 2400 rows when reading these counts

In [None]:
#Assessing the movie title column: the titles were expected to be unique; since they are not,
#further investigation needs to be carried out
train_set['title'].value_counts()

In [None]:
#Checking for fully duplicated rows (rows identical in every column)
train_set[train_set.duplicated()]

No duplicate rows were found. Upon further analysis, it was noticed that the movies sharing a title were indeed distinct movies.

In [None]:
#Inspecting the rows for one repeated title; note that the title being matched ends with a trailing space
train_set[train_set['title'] == 'Frozen ']

In [None]:
#Assessing the country column
train_set['country'].value_counts()

#Identified as a categorical variable
#Should be useful

In [None]:
#Assessing the genres column
train_set['genres'].value_counts()

#Identified as a categorical variable
#Should be useful

In [None]:
#Assessing the language column (15 most frequent values only)
train_set['language'].value_counts().head(15)

#Identified as a categorical variable
#Should be useful

In [None]:
#Assessing the writer count column
train_set['writer_count'].value_counts()

#Identified as a categorical variable
#Might be useful; it will be compared with the revenue for further analysis

In [None]:
#Assessing the title adaption column
train_set['title_adaption'].value_counts()

#Identified as a categorical variable
#Might be useful; it will be compared with the revenue for further analysis

In [None]:
#Assessing the users votes column
train_set['users_votes'].value_counts()

#Too ambiguous, should be dropped

In [None]:
#Assessing the likes column
train_set['likes'].value_counts()

#Identified as a continuous variable
#Should be used for further analysis

In [None]:
#Assessing the runtime column
train_set['runtime'].value_counts()

#Should be useful for analysis

In [None]:
#Assessing the overall_views column
train_set['overall_views'].value_counts()

#Identified as a continuous variable
#Should be used for further analysis

In [None]:
#Assessing the dislikes column
train_set['dislikes'].value_counts()

#Identified as a continuous variable
#Should be useful

In [None]:
#Assessing the ratings_imdb column
train_set['ratings_imdb'].value_counts()

#Identified as a continuous variable
#Should be useful

In [None]:
#Assessing the ratings_tomatoes column
train_set['ratings_tomatoes'].value_counts()

#Identified as a continuous variable
#Should be useful

In [None]:
#Assessing the ratings_metacritic column
train_set['ratings_metacritic'].value_counts()

#Identified as a continuous variable
#Should be useful

In [None]:
#Assessing the censor_rating column
train_set['censor_rating'].value_counts()

#Identified as a categorical variable
#Should be useful

#### Issues - should be written as a function so that the same cleaning can be applied to the test dataset

- Get rid of null values based on important columns like the movie title
- Might need to strip whitespace from the records of certain columns
- Drop unnecessary columns like comments, users_votes
- Change the date columns to datetime: release_date, dvd_release_date
- Create a new column titled 'early_dvd_release'; this is the difference between the cinema release date and the DVD release date
- Clean the runtime column for proper analysis: get rid of the 'min' suffix and convert to a numeric number of minutes
- Create a new column titled 'equivalent_likes' (likes - dislikes); this is what will be used for analysis
- Get rid of the /10 on the imdb ratings column and convert to numeric type
- Get rid of the % on the ratings_tomatoes column and convert to numeric type
- Get rid of the /100 on the metacritic ratings column and convert to numeric type
- Create a first language column
- Create a count of the total number of languages available for the movie
- Create a first genre column
- Create a count of the total number of genres for the movie



In [None]:
def _suffixed_to_numeric(column, suffix_pattern):
    """Remove a trailing suffix (given as a regex) from a string column and convert it to numbers.

    A regex anchored at the end of the string is used instead of ``str.strip(chars)``:
    ``strip`` removes any of the given *characters* from both ends, so
    ``'8.1/10'.strip('/10')`` yields ``'8.'`` and ``'70/100'.strip('/100')`` yields ``'7'``.
    """
    without_suffix = column.str.strip().str.replace(suffix_pattern, '', regex=True)
    return pd.to_numeric(without_suffix)


def clean_dataset(data):
    """Clean the raw movie data and engineer the features used for modelling.

    Rows without a title are dropped, the date columns are parsed, the textual
    numeric columns (runtime, ratings, users_votes) are converted to numbers,
    count features are derived from the comma-separated columns and
    ``censor_rating`` is one-hot encoded. Any row that still contains a null
    value after these steps is removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame as read from the CSV file. ``revenue_category`` is optional
        and is only encoded when the column is present.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the frame passed in is not modified.

    Raises
    ------
    ValueError
        If a runtime/rating/users_votes value cannot be parsed as a number
        once its suffix or thousands separators have been removed.

    Notes
    -----
    The one-hot columns come from ``pd.get_dummies`` and therefore depend on
    the censor ratings present in ``data``: two different inputs can produce
    different sets of columns.
    """
    #drop rows with a missing title; the explicit copy keeps the column assignments
    #below from acting on a possible view of the caller's frame
    key_values = ['title']
    data = data.dropna(subset=key_values).copy()

    #changing the date columns to datetime (unparseable dates become NaT)
    data['dvd_release_date'] = pd.to_datetime(data['dvd_release_date'], errors='coerce')
    data['release_date'] = pd.to_datetime(data['release_date'], errors='coerce')
    #creating the early_dvd_release column: days between cinema release and dvd release
    data['early_dvd_release'] = (data['dvd_release_date'] - data['release_date']).dt.days
    #creating a month column
    data['release_month'] = data['release_date'].dt.month
    #creating a year column
    data['release_year'] = data['release_date'].dt.year

    #cleaning the runtime column ('110 min' -> 110)
    data['runtime'] = _suffixed_to_numeric(data['runtime'], r'\s*min$')
    #creating the equivalent likes column
    data['equivalent_likes'] = data['likes'] - data['dislikes']
    #cleaning the ratings_imdb column ('8.1/10' -> 8.1)
    data['ratings_imdb'] = _suffixed_to_numeric(data['ratings_imdb'], r'/10$')
    #cleaning the ratings_tomatoes column ('85%' -> 85)
    data['ratings_tomatoes'] = _suffixed_to_numeric(data['ratings_tomatoes'], r'%$')
    #cleaning the ratings_metacritic column ('70/100' -> 70)
    data['ratings_metacritic'] = _suffixed_to_numeric(data['ratings_metacritic'], r'/100$')

    #normalising the spelling of the unrated censor ratings; the result is assigned back
    #because an in-place replace on a selected column is not guaranteed to reach the frame
    data['censor_rating'] = data['censor_rating'].replace(
        ['Unrated', 'Not Rated'], ['UNRATED', 'NOT RATED'])

    #creating a column to get the first language
    data['first_language'] = data['language'].str.split(',').str[0]
    #creating a column to get the total number of languages present
    data['No_of_languages'] = data['language'].str.split(',').str.len()
    #creating a column to get the first genre
    data['main_genre'] = data['genres'].str.split(',').str[0]
    #creating a column to get the total number of genres listed
    data['No_of_genres'] = data['genres'].str.split(',').str.len()
    #creating a column to get the first country
    data['first_country'] = data['country'].str.split(',').str[0]
    #creating a column to get the total number of countries listed
    data['No_of_country'] = data['country'].str.split(',').str.len()

    #encoding the title_adaption column records as 1/0
    data['title_adaption'] = data['title_adaption'].replace([True, False], [1, 0])
    #encoding the revenue_category column records; the column is absent from the test
    #dataset, so it is checked for explicitly instead of swallowing every exception
    if 'revenue_category' in data.columns:
        data['revenue_category'] = data['revenue_category'].replace(['Low', 'High'], [0, 1])
    #cleaning the users_votes column ('1,234' -> 1234)
    data['users_votes'] = pd.to_numeric(data['users_votes'].str.replace(',', '', regex=False))

    #drop every row that still has a null value in any column
    data = data.dropna(how='any')
    #turning the censor_rating categories into one indicator column per rating
    cat = pd.get_dummies(data['censor_rating'])

    #joining the indicator columns to the dataframe
    data = pd.concat([data, cat], axis=1)

    #drop the columns that are not used as features
    cols_to_drop = ['comments', 'likes', 'dislikes', 'dvd_release_date', 'release_date', 'censor_rating',
                    'language', 'genres', 'country', 'first_country', 'first_language', 'main_genre']
    data = data.drop(cols_to_drop, axis=1)

    return data
    
    

In [None]:
#cleaning the dataset
new_train_set = clean_dataset(train_set)

In [None]:
#importing machine learning modules
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
#Creating the feature matrix X (everything except the title and the target) and the target vector y
X = new_train_set.drop(['title','revenue_category'],axis = 1)
y = new_train_set['revenue_category']

In [None]:
#Dividing the cleaned data into a training part and a 30% hold-out part (fixed seed for
#reproducibility), then fitting a logistic regression on the training part
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
#Predicting the hold-out part and reporting the accuracy on it
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [None]:
from sklearn.metrics import confusion_matrix
#The result is stored under its own name so that the imported confusion_matrix function
#is not shadowed for the rest of the kernel session
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
#Using another metric
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
#Visualizing the results with a ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
#Both the curve and its area are computed from the predicted probability of class 1.
#Scoring the area from hard predict() labels would describe a single operating point,
#so the number in the legend would not be the area under the curve that is plotted.
positive_proba = logreg.predict_proba(X_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, positive_proba)
fpr, tpr, thresholds = roc_curve(y_test, positive_proba)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
#diagonal reference line for comparison with the model's curve
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
#plt.savefig('Log_ROC')
plt.show()

### Testing on the Original Test_set

Cleaning the original test set with the `clean_dataset` function

In [None]:
new_test_set = clean_dataset(test_set)

In [None]:
#Dropping the title and aligning the feature columns with those the model was trained on (X).
#The one-hot censor rating columns depend on the ratings present in each dataset, so a rating
#missing from the test set is added as an all-zero column, a rating never seen in training is
#dropped, and the column order is made identical to the training features.
Original_test= new_test_set.drop(['title'],axis = 1).reindex(columns = X.columns, fill_value = 0)

In [None]:
#Using the machine learning model developed above to predict the test set
results = logreg.predict(Original_test)

In [None]:
#printing results
results

In [None]:
# Translating the numeric predictions back to their labels (0 -> "Low", anything else -> "High")
# and attaching them to the cleaned test dataframe as a new column
res = ["Low" if prediction == 0 else "High" for prediction in results]

new_test_set['Revenue_category'] = res


In [None]:
results_df = new_test_set[['title', 'Revenue_category']]

In [None]:
results_df.to_csv('submissions.csv', index = False)