# 1. Paraprocesimi i të dhënave


## 1.1 Leximi i CSV-së

In [88]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [89]:
dataseti = pd.read_csv("netflix_titles.csv")

In [90]:
dataseti.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## 1.2 Definimi i atributeve

In [91]:
attr_obj = dataseti.shape
print(f"Numri i atributeve në dataset është: {attr_obj[1]}")
print(f"Numri i objekteve në dataset është: {attr_obj[0]}")

Numri i atributeve në dataset është: 12
Numri i objekteve në dataset është: 8807


In [92]:
x = dataseti.columns.tolist()

print('Atributet në dataset:')

for item in x:
    print(f'    - {item}')

Atributet në dataset:
    - show_id
    - type
    - title
    - director
    - cast
    - country
    - date_added
    - release_year
    - rating
    - duration
    - listed_in
    - description


## 1.3 Tipet e të dhënave

In [93]:
dataseti.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

## 1.4 Kualiteti i të dhënave

### 1.4.1 Identifikimi i vlerave të zbrazëta

In [94]:
def num_missing(x):
  return sum(x.isnull())

In [95]:
print("Numri i vlerave që mungojnë në çdo kolonë:")
print(dataseti.apply(num_missing, axis=0)) #axis=0 nenkupton qe funksionoi num_missing do te aplikohet ne cdo kolone

Numri i vlerave që mungojnë në çdo kolonë:
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


### 1.4.2 Strategjia e trajtimit të vlerave të zbrazëta

In [96]:
#Largimi i rreshtave me vlera NaN
print('Numri i rreshtave në dataset-in origjinal = %d' % (dataseti.shape[0]))

dataseti = dataseti.dropna(axis=0, subset=['director'])
dataseti = dataseti.dropna(axis=0, subset=['cast'])
dataseti = dataseti.dropna(axis=0, subset=['country'])

print('Numri i rreshtave pas largimit të rreshtave me vlera NaN = %d' % (dataseti.shape[0]))
print("Dataseti pas largimit të vlerave të zbrazëta:")

dataseti.head()

Numri i rreshtave në dataset-in origjinal = 8807
Numri i rreshtave pas largimit të rreshtave me vlera NaN = 5336
Dataseti pas largimit të vlerave të zbrazëta:


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...
12,s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic","September 23, 2021",2021,TV-MA,127 min,"Dramas, International Movies",After most of her family is murdered in a terr...
24,s25,Movie,Jeans,S. Shankar,"Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi...",India,"September 21, 2021",1998,TV-14,166 min,"Comedies, International Movies, Romantic Movies",When the father of the man she loves insists t...


In [97]:
print("Numri i vlerave që mungojnë në çdo kolonë pas ndryshimeve:")
print(dataseti.apply(num_missing, axis=0))

Numri i vlerave që mungojnë në çdo kolonë pas ndryshimeve:
show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          1
duration        3
listed_in       0
description     0
dtype: int64


In [98]:
def nullToModeValue(kolona, tipi):
    duration_column = dataseti[kolona]
    type_column =dataseti[tipi]
    duration_counter = Counter(duration_column)
    type_counter = Counter(type_column)
    
    duration_column_object = dict(duration_column)
    type_column_object = dict(type_column)
    
    for item in duration_column_object:
        for element in type_column_object:
            if item == element:
                if f'{duration_column_object[item]}' == 'nan':
                    print('Type:', type_column_object[element], f', {kolona}:', duration_column_object[item])
    
    print(f'\nNumri i vlerave null në kolonën {kolona} : {dataseti[kolona].isnull().sum()}')
    
    print("Vlera më së shumti e përsëritur:", duration_counter.most_common(1)[0][0])
    dataseti[kolona].fillna(duration_counter.most_common(1)[0][0], inplace=True)
    print(f'\nNumri i vlerave null pas plotësimit të vlerave NaN në kolonën {kolona} : {dataseti[kolona].isnull().sum()}')

In [99]:
nullToModeValue("duration", "type")

Type: Movie , duration: nan
Type: Movie , duration: nan
Type: Movie , duration: nan

Numri i vlerave null në kolonën duration : 3
Vlera më së shumti e përsëritur: 94 min

Numri i vlerave null pas plotësimit të vlerave NaN në kolonën duration : 0


In [100]:
nullToModeValue("rating", "type")

Type: Movie , rating: nan

Numri i vlerave null në kolonën rating : 1
Vlera më së shumti e përsëritur: TV-MA

Numri i vlerave null pas plotësimit të vlerave NaN në kolonën rating : 0


In [101]:
print("Numri i vlerave që mungojnë në cdo kolonë pas ndryshimeve finale:")
print(dataseti.apply(num_missing, axis=0))

Numri i vlerave që mungojnë në cdo kolonë pas ndryshimeve finale:
show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


In [102]:
print('Gjatësia finale e dataset-it pas trajtimeve më sipër:', len(dataseti['release_year']))

Gjatësia finale e dataset-it pas trajtimeve më sipër: 5336


### 1.4.3 Pastrimi i vlerave të përsëritura

In [103]:
dups = dataseti.duplicated()
print('Numri i rreshtave të përsëritur = %d' % (dups.sum()))

Numri i rreshtave të përsëritur = 0


In [104]:
#Largimi i rreshtave te perseritur

#print('Numri i rreshtave para largimit te duplikateve = %d' % (dataseti.shape[0]))
#dataseti = dataseti.drop_duplicates()
#print('Numri i rreshtave pas largimit te duplikateve = %d' % (dataseti.shape[0]))

## 1.5 Pastrimi i vlerave jo valide

In [105]:
print(f"\nGrupimi i elementeve sipas atributit 'type':")
dataseti.groupby(['type'])['type'].count()


Grupimi i elementeve sipas atributit 'type':


type
Movie      5189
TV Show     147
Name: type, dtype: int64

Nga komanda e mësipërme vërejmë që për atributin 'type' nuk kemi vlera jo valide.

In [106]:
print(f"\nGrupimi i elementeve sipas atributit 'release_year':")
dataseti.groupby(['release_year'])['release_year'].count()


Grupimi i elementeve sipas atributit 'release_year':


release_year
1942      1
1944      1
1945      1
1946      1
1947      1
       ... 
2017    658
2018    648
2019    519
2020    442
2021    161
Name: release_year, Length: 72, dtype: int64

Nga komanda e mësipërme vërejmë që për atributin 'release_year' nuk kemi vlera jo valide.

In [107]:
print(f"\nGrupimi i elementeve sipas atributit 'rating':")
dataseti.groupby(['rating'])['rating'].count()


Grupimi i elementeve sipas atributit 'rating':


rating
66 min         1
74 min         1
84 min         1
G             40
NC-17          2
NR            58
PG           275
PG-13        470
R            778
TV-14       1214
TV-G          84
TV-MA       1823
TV-PG        431
TV-Y          76
TV-Y7         76
TV-Y7-FV       3
UR             3
Name: rating, dtype: int64

Nga komanda e mësipërme vërejmë që për atributin 'rating' kemi 3 vlera jo valide, të cilat do t'i trajtojmë në vijim.

In [108]:
#update fields to NaN
dataseti.loc[(dataseti.rating == '66 min') | (dataseti.rating == '74 min') | (dataseti.rating == '84 min'), 'rating'] = 'nan'

In [109]:
print(f"\nGrupimi i elementeve sipas atributit 'rating' pas UPDATE:")
dataseti.groupby(['rating'])['rating'].count()


Grupimi i elementeve sipas atributit 'rating' pas UPDATE:


rating
G             40
NC-17          2
NR            58
PG           275
PG-13        470
R            778
TV-14       1214
TV-G          84
TV-MA       1823
TV-PG        431
TV-Y          76
TV-Y7         76
TV-Y7-FV       3
UR             3
nan            3
Name: rating, dtype: int64

In [110]:
#ku kemi stringun nan konvertoje ne nan type
dataseti.loc[dataseti.rating == 'nan', 'rating'] = np.nan

In [111]:
dataseti["rating"].value_counts(dropna=False)

TV-MA       1823
TV-14       1214
R            778
PG-13        470
TV-PG        431
PG           275
TV-G          84
TV-Y7         76
TV-Y          76
NR            58
G             40
NaN            3
TV-Y7-FV       3
UR             3
NC-17          2
Name: rating, dtype: int64

In [112]:
dataseti["rating"].value_counts()

TV-MA       1823
TV-14       1214
R            778
PG-13        470
TV-PG        431
PG           275
TV-G          84
TV-Y7         76
TV-Y          76
NR            58
G             40
TV-Y7-FV       3
UR             3
NC-17          2
Name: rating, dtype: int64

Atributi rating u pastruara nga vlerat jo valide.

In [113]:
print(f"\nGrupimi i elementeve sipas atributit 'duration':")
dataseti.groupby(['duration'])['duration'].count()


Grupimi i elementeve sipas atributit 'duration':


duration
1 Season    106
100 min      90
101 min     104
102 min     112
103 min     101
           ... 
95 min      127
96 min      109
97 min      129
98 min      109
99 min      106
Name: duration, Length: 198, dtype: int64

In [114]:
print(f"\nGrupimi i elementeve sipas atributit 'duration':")
dict(dataseti.groupby(['duration'])['duration'].count())


Grupimi i elementeve sipas atributit 'duration':


{'1 Season': 106,
 '100 min': 90,
 '101 min': 104,
 '102 min': 112,
 '103 min': 101,
 '104 min': 99,
 '105 min': 90,
 '106 min': 99,
 '107 min': 86,
 '108 min': 77,
 '109 min': 66,
 '110 min': 85,
 '111 min': 61,
 '112 min': 72,
 '113 min': 62,
 '114 min': 49,
 '115 min': 57,
 '116 min': 76,
 '117 min': 59,
 '118 min': 62,
 '119 min': 61,
 '12 min': 2,
 '120 min': 51,
 '121 min': 52,
 '122 min': 39,
 '123 min': 41,
 '124 min': 51,
 '125 min': 34,
 '126 min': 42,
 '127 min': 47,
 '128 min': 41,
 '129 min': 32,
 '13 min': 1,
 '130 min': 40,
 '131 min': 31,
 '132 min': 36,
 '133 min': 39,
 '134 min': 22,
 '135 min': 35,
 '136 min': 21,
 '137 min': 36,
 '138 min': 14,
 '139 min': 20,
 '14 min': 1,
 '140 min': 23,
 '141 min': 17,
 '142 min': 13,
 '143 min': 23,
 '144 min': 8,
 '145 min': 16,
 '146 min': 11,
 '147 min': 11,
 '148 min': 19,
 '149 min': 15,
 '15 Seasons': 1,
 '15 min': 2,
 '150 min': 16,
 '151 min': 15,
 '152 min': 5,
 '153 min': 11,
 '154 min': 12,
 '155 min': 10,
 '156 min':

Nga komanda e mësipërme vërejmë që për atributin 'duration' nuk kemi vlera jo valide.

## 1.6 Agregimi

### 1.6.1 Agregimi përmes atributit "rating"

In [115]:
rating_agg = dataseti.groupby(['rating'])

In [116]:
rating_agg.describe()

Unnamed: 0_level_0,release_year,release_year,release_year,release_year,release_year,release_year,release_year,release_year
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
G,40.0,1997.25,18.327155,1956.0,1988.25,2002.5,2009.5,2019.0
NC-17,2.0,2013.5,0.707107,2013.0,2013.25,2013.5,2013.75,2014.0
NR,58.0,2009.948276,11.840197,1958.0,2011.25,2015.0,2016.0,2018.0
PG,275.0,2008.127273,11.753326,1973.0,2004.0,2011.0,2017.0,2021.0
PG-13,470.0,2009.178723,8.945343,1955.0,2005.0,2011.0,2016.0,2021.0
R,778.0,2010.341902,10.003413,1962.0,2007.0,2014.0,2017.0,2021.0
TV-14,1214.0,2012.218287,10.630927,1942.0,2011.0,2016.0,2018.0,2021.0
TV-G,84.0,2015.297619,9.063072,1954.0,2015.0,2017.0,2019.25,2021.0
TV-MA,1823.0,2015.908941,6.026692,1963.0,2016.0,2017.0,2019.0,2021.0
TV-PG,431.0,2012.106729,11.644251,1946.0,2011.0,2016.0,2018.0,2021.0


In [117]:
rating_agg.agg('min')

Unnamed: 0_level_0,show_id,type,title,director,cast,country,date_added,release_year,duration,listed_in,description
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
G,s108,Movie,A Champion Heart,Ben Wallis,"Anika Noni Rose, Bruno Campos, Keith David, Mi...",Canada,"April 1, 2018",1956,100 min,"Action & Adventure, Classic Movies, Sci-Fi & F...",A pilot challenges Balto's son Kodi and his sl...
NC-17,s5265,Movie,Blue Is the Warmest Color,Abdellatif Kechiche,"Léa Seydoux, Adèle Exarchopoulos, Salim Kechio...",Canada,"August 26, 2016",2013,112 min,"Comedies, International Movies","After losing a TV network deal, the Trailer Pa..."
NR,s5988,Movie,13 Cameras,"Aaron Nee, Adam Nee","Addison Timlin, Ally Sheedy, Keith Poulson, Pe...",Argentina,"April 1, 2016",1958,101 min,"Action & Adventure, Comedies, Independent Movies","""Last Comic Standing"" winner Iliza Shlesinger ..."
PG,s1020,Movie,A 2nd Chance,Aamir Khan,"Aamir Khan, Darsheel Safary, Tanay Chheda, Tis...","Argentina, Spain","April 1, 2018",1973,100 min,"Action & Adventure, Anime Features, Children &...",A 10-year-old dreamer's imaginary friends – mi...
PG-13,s10,Movie,"10,000 B.C.",Adam McKay,"Aamir Khan, Kareena Kapoor, Madhavan, Sharman ...",Australia,"April 1, 2016",1955,100 min,Action & Adventure,A 9-year-old boy finds his life upended when h...
R,s1014,Movie,13 Sins,Aaron Sorkin,"50 Cent, Ryan Phillippe, Bruce Willis, Rory Ma...",Argentina,"April 1, 2018",1962,100 min,Action & Adventure,"""Friends with benefits"" Emma and Adam are fine..."
TV-14,s1002,Movie,#AnneFrank - Parallel Stories,A. L. Vijay,"Aadhi, Tapsee Pannu, Ritika Singh, Vennela Kis...",", France, Algeria","March 31, 2018",1942,1 Season,Action & Adventure,"""Queen of Sleaze"" or feminist pioneer? Powerho..."
TV-G,s1190,Movie,#FriendButMarried,Alastair Fothergill,"Adipati Dolken, Mawar de Jongh, Sari Nila, Von...",Australia,"April 1, 2020",1954,1 Season,"British TV Shows, Docuseries, International TV...",A by-the-book political aide falls for a big-h...
TV-MA,s1000,Movie,#Alive,Aadish Keluskar,"Aamina Sheikh, Sanam Saeed, Adnan Malik, Moham...",Argentina,"April 1, 2017",1963,1 Season,Action & Adventure,"""Brooklyn Nine-Nine"" star Chelsea Peretti show..."
TV-PG,s1043,Movie,'76,"Abbas Alibhai Burmawalla, Mastan Alibhai Burma...","Aamir Khan, Sakshi Tanwar, Fatima Sana Shaikh,...",Argentina,"August 4, 2017",1946,1 Season,"Action & Adventure, Anime Features, Children &...",A London writer bonds with the colorful reside...


In [118]:
rating_agg.agg('max')

Unnamed: 0_level_0,show_id,type,title,director,cast,country,date_added,release_year,duration,listed_in,description
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
G,s934,Movie,Willy Wonka & the Chocolate Factory,"Vincente Minnelli, Charles Walters","Zach Braff, Joan Cusack, Patrick Stewart, Stev...","United States, Spain","September 4, 2021",2019,99 min,"Documentaries, Music & Musicals",Zany Willy Wonka causes a stir when he announc...
NC-17,s6339,Movie,Swearnet: The Movie,Warren P. Sonoda,"Mike Smith, John Paul Tremblay, Robb Wells, Pa...","France, Belgium, Spain","September 24, 2017",2014,180 min,"Dramas, Independent Movies, International Movies","Determined to fall in love, 15-year-old Adele ..."
NR,s8793,Movie,Young Tiger,Zatella Beatty,Zachary Levi,"United States, India, Bangladesh","September 29, 2018",2018,96 min,Stand-Up Comedy,Young parents-to-be Claire and Ryan move into ...
PG,s95,Movie,Zoom,Zack Snyder,"YaYa Gosselin, Pedro Pascal, Priyanka Chopra, ...","United States, United Kingdom, Canada","September 8, 2021",2021,99 min,"Dramas, Sports Movies",Zany misadventures are in store as lovable cit...
PG-13,s971,Movie,Æon Flux,"Àlex Pastor, David Pastor","Zoey Deutch, Lucy Fry, Danila Kozlovsky, Gabri...","United States, United Kingdom, Spain, South Korea","September 9, 2020",2021,99 min,Thrillers,Young Americans visit Moscow when the city is ...
R,s972,Movie,Zoot Suit,Zoe Lister-Jones,"Zoe Lister-Jones, Adam Pally, Fred Armisen, Su...","United States, United Kingdom, Italy","September 8, 2020",2021,99 min,Thrillers,Zack and Miri make and star in an adult film t...
TV-14,s995,TV Show,​​Kuch Bheege Alfaaz,Şenol Sönmez,"Ṣọpẹ́ Dìrísù, Wunmi Mosaku, Matt Smith, Malaik...",Vietnam,"September 9, 2020",2021,99 min,Thrillers,"Zixin is about to marry Qihong, but her galliv..."
TV-G,s992,TV Show,You Can Tutu,Yandy Laurens,Woody Harrelson,Zimbabwe,"September 22, 2020",2021,97 min,Stand-Up Comedy,Young orphan Nicolas learns how to guide a sle...
TV-MA,s997,TV Show,반드시 잡는다,Ömer Faruk Sorak,"Şahin Irmak, İrem Sak, Gonca Vuslateri, Emre K...",Vietnam,"September 9, 2021",2021,99 min,Thrillers,Yılmaz Erdoğan's lauded stage play traces the ...
TV-PG,s974,TV Show,Zipi & Zape y la Isla del Capitan,Yılmaz Erdoğan,Zion Clark,Uruguay,"September 9, 2019",2021,99 min,"TV Comedies, TV Dramas",Young orphan Kuttappayi goes to live with his ...


### 1.6.2 Agregimi përmes atributit "type"

In [119]:
type_agg = dataseti.groupby(['type'])

In [120]:
type_agg.describe()

Unnamed: 0_level_0,release_year,release_year,release_year,release_year,release_year,release_year,release_year,release_year
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Movie,5189.0,2012.614569,9.70336,1942.0,2011.0,2016.0,2018.0,2021.0
TV Show,147.0,2017.285714,4.047661,1990.0,2016.0,2018.0,2020.0,2021.0


In [121]:
type_agg.agg({'release_year': 'max'})

Unnamed: 0_level_0,release_year
type,Unnamed: 1_level_1
Movie,2021
TV Show,2021


In [122]:
type_agg.agg({'release_year': 'min'})

Unnamed: 0_level_0,release_year
type,Unnamed: 1_level_1
Movie,1942
TV Show,1990


## 1.7 Mostrimi

In [123]:
dataseti.sample(n=2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
325,s326,Movie,Beethoven,Brian Levant,"Charles Grodin, Bonnie Hunt, Dean Jones, Olive...",United States,"August 1, 2021",1992,PG,87 min,"Children & Family Movies, Comedies",A father reluctantly agrees to let his childre...
6667,s6668,Movie,Eh Janam Tumhare Lekhe,Harjit Singh,"Pavan Malhotra, Sudhanshu Aggarwal, Arjuna Bha...",India,"December 1, 2017",2015,TV-14,124 min,"Dramas, International Movies",Driven by the lessons he learned from his moth...


In [124]:
dataseti.sample(frac=0.001)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
6006,s6007,Movie,28 Days,Betty Thomas,"Sandra Bullock, Viggo Mortensen, Dominic West,...",United States,"September 30, 2020",2000,PG-13,104 min,"Comedies, Dramas",After her drunken antics result in property da...
8552,s8553,Movie,The Water Diviner,Russell Crowe,"Russell Crowe, Olga Kurylenko, Yılmaz Erdoğan,...","Australia, United States","October 7, 2019",2014,R,111 min,"Dramas, International Movies",Years after the presumed death of his three so...
2391,s2392,Movie,Milea,"Fajar Bustomi, Pidi Baiq","Iqbaal Ramadhan, Vanesha Prescilla, Ira Wibowo...",Indonesia,"June 13, 2020",2020,TV-14,100 min,"Dramas, International Movies, Romantic Movies","Years after his teen romance with Milea, a now..."
2353,s2354,Movie,Chaman Bahaar,Apurva Dhar Badgaiyann,"Jitendra Kumar, Ritika Badiani, Yogendra Tikku...",India,"June 19, 2020",2020,TV-MA,112 min,"Comedies, Dramas, International Movies",A local shop becomes a hub for young men taken...
5031,s5032,Movie,Forgotten,Hang-Jun Jang,"Ha-neul Kang, Moo-Yul Kim, Young-hee Na, Seong...",South Korea,"February 21, 2018",2017,TV-MA,109 min,"Dramas, International Movies, Thrillers",When his abducted brother returns seemingly a ...


In [128]:
dataseti.sample(n=2, axis=1)

Unnamed: 0,title,duration
7,Sankofa,125 min
8,The Great British Baking Show,9 Seasons
9,The Starling,104 min
12,Je Suis Karl,127 min
24,Jeans,166 min
...,...,...
8801,Zinzana,96 min
8802,Zodiac,158 min
8804,Zombieland,88 min
8805,Zoom,88 min


In [133]:
dataseti['rating'].sample(n=3)

7216    TV-14
990     TV-MA
6134        R
Name: rating, dtype: object

In [134]:
dataseti['rating'].sample(n=3, random_state=1)

3022    TV-14
4438    TV-MA
8662    TV-14
Name: rating, dtype: object

In [136]:
dataseti.sample(3, axis=1).head()

Unnamed: 0,rating,country,title
7,TV-MA,"United States, Ghana, Burkina Faso, United Kin...",Sankofa
8,TV-14,United Kingdom,The Great British Baking Show
9,PG-13,United States,The Starling
12,TV-MA,"Germany, Czech Republic",Je Suis Karl
24,TV-14,India,Jeans


In [137]:
col = 'type'
for typ in list(dataseti[col].dropna().unique()):
    print(typ, end=' - ')
    display(dataseti[dataseti[col] == typ].sample(3))

Movie - 

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
1041,s1042,Movie,GANTZ:O,"Keiichi Sato, Yasushi Kawamura","Daisuke Ono, M・A・O, Tomohiro Kaku, Saori Hayam...",Japan,"April 15, 2021",2016,TV-MA,96 min,"Action & Adventure, Anime Features, Horror Movies",Teams of recently deceased people who've been ...
2957,s2958,Movie,Hum Aapke Hain Koun,Sooraj R. Barjatya,"Madhuri Dixit, Salman Khan, Mohnish Bahl, Renu...",India,"February 1, 2020",1994,TV-14,193 min,"Classic Movies, Dramas, International Movies","Although Nisha falls for Prem, she agrees to m..."
635,s636,Movie,Here Comes the Rain,Bahij Hojeij,"Hassan Mrad, Julia Kassar, Carmen Lebbos, Diam...",Lebanon,"June 25, 2021",2010,TV-MA,101 min,"Dramas, International Movies",Abducted during the Lebanese Civil War and now...


TV Show - 

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
3423,s3424,TV Show,Black Money Love,Ahmet Katıksız,"Güler Ökten, Hazal Türesan, İlkin Tüfekçi, Bed...",Turkey,"October 15, 2019",2014,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Dramas",After a cop's fiancée and a jewelry designer's...
723,s724,TV Show,The American Bible Challenge,Michael Simon,Jeff Foxworthy,United States,"June 15, 2021",2014,TV-G,1 Season,Reality TV,Join host Jeff Foxworthy as contestants test t...
6408,s6409,TV Show,Camelia la Texana,Carlos Bolado,"Sara Maldonado, Erik Hayser, Andrés Palacios, ...","Mexico, United States","February 1, 2017",2014,TV-14,1 Season,"Crime TV Shows, Romantic TV Shows, Spanish-Lan...","Inspired by the famous song ""Contrabando y Tra..."


In [138]:
col = 'type'
sample = []

variants = list(dataseti[col].dropna().unique())
print(variants)

for typ in variants:
    sample.append(dataseti[dataseti[col] == typ].sample())
pd.concat(sample)


['Movie', 'TV Show']


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
747,s748,Movie,Confusion Na Wa,Kenneth Gyang,"Ramsey Nouah, OC Ukeje, Ali Nuhu, Tunde Alades...",Nigeria,"June 9, 2021",2013,TV-MA,106 min,"Comedies, Dramas, International Movies",A misplaced cell phone's incriminating content...
2405,s2406,TV Show,DC's Legends of Tomorrow,Rob Seidenglanz,"Victor Garber, Brandon Routh, Caity Lotz, Fran...",United States,"June 10, 2020",2020,TV-14,5 Seasons,"TV Action & Adventure, TV Sci-Fi & Fantasy","A mysterious ""time master"" from the future uni..."


In [139]:
dataseti.groupby('type').apply(lambda x: x.sample(n=3))

Unnamed: 0_level_0,Unnamed: 1_level_0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Movie,8692,s8693,Movie,Walk with Me,"Marc Francis, Max Pugh","Benedict Cumberbatch, Thich Nhat Hanh",United Kingdom,"December 26, 2017",2017,TV-PG,94 min,Documentaries,A community of monks in France led by Zen Budd...
Movie,6551,s6552,Movie,Daddy Issues,Amara Cash,"Madison Lawlor, Montana Manning, Andrew Pifko,...",United States,"September 20, 2019",2019,TV-MA,82 min,"Dramas, Independent Movies, LGBTQ Movies",Maya finally hooks up with her online dream gi...
Movie,4803,s4804,Movie,Bill Burr: You People Are All the Same,Jay Karas,Bill Burr,United States,"June 30, 2018",2012,TV-MA,69 min,Stand-Up Comedy,Funnyman Bill Burr takes the stage to uncork a...
TV Show,1715,s1716,TV Show,A Queen Is Born,Carla Barros,"Gloria Groove, Alexia Twister",Brazil,"November 11, 2020",2020,TV-14,1 Season,"International TV Shows, Reality TV",Gloria Groove and Alexia Twister make drag dre...
TV Show,4174,s4175,TV Show,Innocent,Seren Yüce,"Ali Atay, Haluk Bilginer, Nur Sürer, Okan Yala...",Turkey,"January 23, 2019",2017,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Dramas","In a peaceful, rustic town, a retired officer ..."
TV Show,6408,s6409,TV Show,Camelia la Texana,Carlos Bolado,"Sara Maldonado, Erik Hayser, Andrés Palacios, ...","Mexico, United States","February 1, 2017",2014,TV-14,1 Season,"Crime TV Shows, Romantic TV Shows, Spanish-Lan...","Inspired by the famous song ""Contrabando y Tra..."


In [141]:
# kombinon dy rreshtat e pare dhe dy te fundit
rows = 2
dataseti.head(rows).append(dataseti.tail(rows))

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,"March 2, 2019",2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...


## 1.8 Diskretizimi dhe Binarizimi

### 1.8.1 Diskretizimi

In [142]:
release_decade = ['1941-1950', '1951-1960', '1961-1970', '1971-1980', '1981-1990', '1991-2000', '2001-2010', '2011-2020', '2021-2030']
s1 = pd.cut(x=dataseti.release_year, bins=9, labels=release_decade)

In [85]:
s1

7       1991-2000
8       2021-2030
9       2021-2030
12      2021-2030
24      2001-2010
          ...    
8801    2021-2030
8802    2011-2020
8804    2011-2020
8805    2011-2020
8806    2021-2030
Name: release_year, Length: 5336, dtype: category
Categories (9, object): ['1941-1950' < '1951-1960' < '1961-1970' < '1971-1980' ... '1991-2000' < '2001-2010' < '2011-2020' < '2021-2030']

### 1.8.2 Binarizimi

In [143]:
rating_binarization = pd.get_dummies(dataseti['rating'])
rating_binarization

Unnamed: 0,G,NC-17,NR,PG,PG-13,R,TV-14,TV-G,TV-MA,TV-PG,TV-Y,TV-Y7,TV-Y7-FV,UR
7,0,0,0,0,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,1,0,0,0,0,0
24,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8801,0,0,0,0,0,0,0,0,1,0,0,0,0,0
8802,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8804,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8805,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [144]:
release_year_binarization = pd.get_dummies(dataseti['release_year'])
release_year_binarization

Unnamed: 0,1942,1944,1945,1946,1947,1954,1955,1956,1958,1959,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8805,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [145]:
type_binarization = pd.get_dummies(dataseti['type'])
type_binarization

Unnamed: 0,Movie,TV Show
7,1,0
8,0,1
9,1,0
12,1,0
24,1,0
...,...,...
8801,1,0
8802,1,0
8804,1,0
8805,1,0


## 1.9 Transformimi

In [146]:
rslt_df = dataseti.loc[dataseti['type'] == 'Movie']
rslt_df.groupby('release_year')['release_year'].transform(len)

7        23
9       146
12      146
24       30
27      140
       ... 
8801    342
8802     71
8804    112
8805     80
8806    342
Name: release_year, Length: 5189, dtype: int64

In [147]:
def retreive_mins(value):
    minutes = value.split(' ')[0]
    oret = int(minutes) // 60
    minutat = int(minutes) % 60
    
    if oret == 0:
        ora = f'{minutes} minutes'
    elif oret == 1:
        ora = f'{oret} hour and {minutat} minutes'
    else:
        ora = f'{oret} hours and {minutat} minutes'
    return str(ora)

In [148]:
retreive_mins('101 minutes')

'1 hour and 41 minutes'

In [149]:
rslt_df.groupby('duration')['duration'].transform(lambda x : retreive_mins(str(x)))[400:500]

931                 44 minutes
932                 46 minutes
933                 57 minutes
934     2 hours and 31 minutes
935                 38 minutes
                 ...          
1107    2 hours and 19 minutes
1108                44 minutes
1109                43 minutes
1110    2 hours and 15 minutes
1111                43 minutes
Name: duration, Length: 100, dtype: object

## 1.10 Reduktimi i Dimensionit

Reduktimi i dimensionit SVD (Singular Value Decomposition)

In [151]:
ds1 = pd.DataFrame(dataseti['release_year'])

In [152]:
u, s, vh = np.linalg.svd(ds1, full_matrices=False)

In [154]:
pd.DataFrame((u * s) @ vh, columns=ds1.columns).round(2)

Unnamed: 0,release_year
0,1993.0
1,2021.0
2,2021.0
3,2021.0
4,1998.0
...,...
5331,2015.0
5332,2007.0
5333,2009.0
5334,2006.0


Reduktimi i dimensionit duke përdorur PCA ose SVD është i papërshtatshëm për datasetin tonë për shkak të variancës së vogël të atributeve dhe pasi që kemi vetëm atributin 'release_year' me vlerë numerike (int).

## 1.11 Krijimi i vetive

In [155]:
dataseti.insert(12, 'year_added', int())
dataseti.insert(13, 'month_added', int())

In [156]:
months = {"January":1,
          "February":2,
          "March":3,
          "April":4,
          "May":5,
          "June":6,
          "July":7,
          "August":8,
          "September":9,
          "October":10,
          "November":11,
          "December":12}

for k, row in dataseti.iterrows():
    year = row['date_added'].split()[2]
    month = months[row['date_added'].split()[0]]
    dataseti.loc[k, "year_added"] = year
    dataseti.loc[k, "month_added"] = month 

print('Krijimi i vetive u përfundua me sukses!')

Krijimi i vetive u përfundua me sukses!


In [157]:
dataseti_by_year = dataseti[["type","year_added"]]
dataseti_by_year.groupby(["type","year_added"])['year_added'].count()

type     year_added
Movie    2008             1
         2009             2
         2010             1
         2011            13
         2012             3
         2013             6
         2014            14
         2015            47
         2016           197
         2017           704
         2018          1085
         2019          1236
         2020          1151
         2021           729
TV Show  2013             1
         2015             3
         2016             7
         2017            22
         2018            16
         2019            29
         2020            43
         2021            26
Name: year_added, dtype: int64

# 1.11 Zgjedhja e nënbashkësisë së vetive

Zgjedhja e nënbashkësive përmes Chi-Squared statistical test duke përdorur SelectKBest klasën.

In [158]:
arrayy = dataseti.values
X = arrayy[:,12:14]
Y = arrayy[:,12]

In [159]:
Y

array(['2021', '2021', '2021', ..., '2019', '2020', '2019'], dtype=object)

In [160]:
X

array([['2021', 9],
       ['2021', 9],
       ['2021', 9],
       ...,
       ['2019', 11],
       ['2020', 1],
       ['2019', 3]], dtype=object)

In [161]:
test = SelectKBest(score_func=chi2, k=2)
fit = test.fit(X, Y)
np.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
print(features[0:5,:])

[  6.277 450.815]
[['2021' 9]
 ['2021' 9]
 ['2021' 9]
 ['2021' 9]
 ['2021' 9]]


Zgjedhja e nënbashkësive përmes Recursive Feature Elemination duke përdorur LogisticRegression klasën.

In [162]:
model = LogisticRegression(solver='lbfgs', max_iter=10000)
rfe = RFE(model, 1)
fit = rfe.fit(X, Y)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

Num Features: 1
Selected Features: [False  True]
Feature Ranking: [2 1]


Zgjedhja e nënbashkësive përmes Ridge Regression duke përdorur Ridge klasën.

In [163]:
ridge = Ridge(alpha=1.0)
ridge.fit(X,Y)

Ridge()

In [164]:
def pretty_print_coefs(coefs, names = None, sort = False):
    if names == None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst,  key = lambda x:-np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name) for coef, name in lst)

In [165]:
print("Ridge model:", pretty_print_coefs(ridge.coef_))

Ridge model: 1.0 * X0 + -0.0 * X1
