In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.ensemble import RandomForestClassifier

#univariate feature selection
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression

#mode selection
from sklearn import tree
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import metrics

class MLResearchScikitLearn:

    def __init__(self, dataset):
        self.dataset = dataset
        self.nominalfeatures = ['color','director_name','actor_2_name','actor_1_name','actor_3_name','plot_keywords','language','country','content_rating','genres','movie_title','movie_imdb_link']
        self.integerfeatures = ['num_critic_for_reviews','duration','director_facebook_likes','actor_3_facebook_likes','actor_1_facebook_likes','gross','facenumber_in_poster','num_user_for_reviews','budget','title_year','actor_2_facebook_likes','aspect_ratio']

    def imputation(self):

        #apply imputer with strategy=most_frequent for missing values in string type feature column
        imputer_string = SimpleImputer(missing_values=np.NaN, strategy="most_frequent")
        imputer_string.fit(self.dataset[self.nominalfeatures])
        self.dataset[self.nominalfeatures] = imputer_string.transform(self.dataset[self.nominalfeatures])
        
        #apply imputer with strategy=mean for missing values in integer type feature column
        imputer_integer = SimpleImputer(missing_values=np.NaN, strategy="mean")
        imputer_integer.fit(self.dataset[self.integerfeatures])
        self.dataset[self.integerfeatures] = imputer_integer.transform(self.dataset[self.integerfeatures])

    def featureEncoding(self):
        enc= OrdinalEncoder()
        self.dataset[self.nominalfeatures] = enc.fit_transform(self.dataset[self.nominalfeatures])

    def convertFeatureToBinary(self):
        median = self.dataset["imdb_score"].median()
        for i in range (self.dataset.shape[0]):
            if self.dataset["imdb_score"][i] >= median:
                self.dataset["imdb_score"][i] = 1
            elif self.dataset["imdb_score"][i] < median:
                self.dataset["imdb_score"][i] = 0

    def checkOutliers(self):
        sns.boxplot(data=pd.DataFrame(self.dataset["imdb_score"]))
        plt.show()

    def handleImbalance(self):
        targets = self.dataset["imdb_score"].value_counts()
        #print (targets)
        print ("Minority class represents just ",(targets[1]/len(self.dataset["imdb_score"]))*100, " % of the dataset") 

    def univariateFeatureSelection(self):
        X = self.dataset.loc[:, self.dataset.columns != 'imdb_score']
        y = self.dataset["imdb_score"]
        selector = SelectPercentile(f_regression, percentile=25)
        selector.fit(X,y)
        for s in selector.scores_:
            print ("Score : ", s)

    def treeBaseFeatureSelection(self):
        X = self.dataset.loc[:, self.dataset.columns != 'imdb_score']
        y = self.dataset["imdb_score"]
        # Build a forest and compute the feature importance
        forest = RandomForestClassifier(n_estimators=250, random_state=0)
        forest.fit(X, y)
        importances = forest.feature_importances_
        for index in range(X.shape[1]):
            #print(index)
            print ("Importance of feature ", index, "is", importances[index])

    def kFoldCrossValidation(self):
        allResults= []

        split_ds = train_test_split(self.dataset, test_size=int(self.dataset.shape[0]*20/100), random_state = 1)

        train = split_ds[0]
        test = split_ds[1]

        train_data = train.loc[:, train.columns != 'imdb_score']
        train_target = train["imdb_score"]        

        test_data = test.loc[:, test.columns != 'imdb_score']
        test_target = test["imdb_score"] 


        kf= model_selection.KFold(n_splits=6, shuffle=True, random_state=1)
        
        print("train data  is >>>>>>:")
        print(train_data)

        for train_index, test_index in kf.split(train_data):
            clf= tree.DecisionTreeClassifier()
            clf.fit(train_data.loc[train_index], train_target.loc[train_index])

            results= clf.predict(train_data.loc[test_index])

            allResults.append(metrics.accuracy_score(results, train_target.loc[test_index]))

        print ("Accuracy is ", np.mean(allResults))

dataset = pd.read_csv("movie_metadata.csv")
print("initial dataset", dataset)
classobj = MLResearchScikitLearn(dataset)
classobj.imputation()
classobj.featureEncoding()
classobj.convertFeatureToBinary()
#classobj.handleImbalance()
#classobj.univariateFeatureSelection()
#classobj.checkOutliers()
classobj.kFoldCrossValidation()

#print("nan:>>>>",dataset.isna().any())


initial dataset       color      director_name  num_critic_for_reviews  duration  \
0     Color      James Cameron                   723.0     178.0   
1     Color     Gore Verbinski                   302.0     169.0   
2     Color         Sam Mendes                   602.0     148.0   
3     Color  Christopher Nolan                   813.0     164.0   
4       NaN        Doug Walker                     NaN       NaN   
...     ...                ...                     ...       ...   
5038  Color        Scott Smith                     1.0      87.0   
5039  Color                NaN                    43.0      43.0   
5040  Color   Benjamin Roberds                    13.0      76.0   
5041  Color        Daniel Hsia                    14.0     100.0   
5042  Color           Jon Gunn                    43.0      90.0   

      director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                         0.0                   855.0  Joel David Moore   
1                

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


train data  is >>>>>>:
      color  director_name  num_critic_for_reviews  duration  \
1960    1.0          298.0                   233.0      93.0   
120     1.0          373.0                   478.0     128.0   
3638    1.0         2344.0                    56.0     104.0   
3926    1.0          997.0                    45.0      94.0   
2916    1.0         2363.0                   304.0     132.0   
...     ...            ...                     ...       ...   
2895    1.0          985.0                    62.0     100.0   
2763    1.0           15.0                    94.0     119.0   
905     1.0         2038.0                   391.0     123.0   
3980    1.0         1661.0                   112.0     109.0   
235     1.0         1202.0                   539.0     124.0   

      director_facebook_likes  actor_3_facebook_likes  actor_2_name  \
1960                     27.0                    49.0        1472.0   
120                   22000.0                 11000.0        1749.

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [4]:
print(dataset.iloc["color"])

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
5038    1.0
5039    1.0
5040    1.0
5041    1.0
5042    1.0
Name: color, Length: 5043, dtype: float64
