In [1]:
import pandas as pd
import numpy as np

In [2]:
imdb_data = pd.read_csv('imdb.csv')

imdb_data = imdb_data.drop(columns=['Rank', 'Description', 'Director', 'Actors'])
imdb_data = imdb_data.dropna()
imdb_data = imdb_data[imdb_data['Year'] < 2014]
imdb_data = imdb_data.reset_index(drop=True)
imdb_data = imdb_data.rename(columns={'Runtime (Minutes)': 'Runtime', 'Revenue (Millions)': 'Revenue'})
imdb_data

Unnamed: 0,Title,Genre,Year,Runtime,Rating,Votes,Revenue,Metascore
0,Prometheus,"Adventure,Mystery,Sci-Fi",2012,124,7.0,485820,126.46,65.0
1,Pirates of the Caribbean: On Stranger Tides,"Action,Adventure,Fantasy",2011,136,6.7,395025,241.06,45.0
2,The Dark Knight,"Action,Crime,Drama",2008,152,9.0,1791916,533.32,82.0
3,The Prestige,"Drama,Mystery,Sci-Fi",2006,130,8.5,913152,53.08,66.0
4,Pirates of the Caribbean: At World's End,"Action,Adventure,Fantasy",2007,169,7.1,498821,309.40,50.0
...,...,...,...,...,...,...,...,...
433,Taare Zameen Par,"Drama,Family,Music",2007,165,8.5,102697,1.20,42.0
434,Resident Evil: Afterlife,"Action,Adventure,Horror",2010,97,5.9,140900,60.13,37.0
435,Project X,Comedy,2012,88,6.7,164088,54.72,48.0
436,Hostel: Part II,Horror,2007,94,5.5,73152,17.54,46.0


In [3]:
def quantification_scalar(data_frame, column_label):
    data_frame = data_frame.copy()
    unique_values = set(data_frame[column_label])
    for value in unique_values:
        label = f'{column_label}_{value}'
        data_frame[label] = (data_frame[column_label] == value).astype(int)
    return data_frame.drop(columns=[column_label])

def quantification_set(data_frame, column_label):
    data_frame = data_frame.copy()
    unique_values = set()
    for values in data_frame[column_label]:        
        unique_values |= set(values.split(','))
    
    for value in unique_values:
        label = f'{column_label}_{value}'
        data_frame[label] = len(data_frame)*[0]
        for i in range(len(data_frame)):
            data_frame[label][i] = 1 if value in data_frame[column_label][i] else 0
        
    return data_frame.drop(columns=[column_label])


In [4]:
imdb_data = quantification_scalar(imdb_data, "Year")
imdb_data = quantification_set(imdb_data, "Genre")
imdb_data

Unnamed: 0,Title,Runtime,Rating,Votes,Revenue,Metascore,Year_2006,Year_2007,Year_2008,Year_2009,...,Genre_History,Genre_Music,Genre_Romance,Genre_War,Genre_Animation,Genre_Fantasy,Genre_Musical,Genre_Crime,Genre_Thriller,Genre_Family
0,Prometheus,124,7.0,485820,126.46,65.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Pirates of the Caribbean: On Stranger Tides,136,6.7,395025,241.06,45.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,The Dark Knight,152,9.0,1791916,533.32,82.0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,The Prestige,130,8.5,913152,53.08,66.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Pirates of the Caribbean: At World's End,169,7.1,498821,309.40,50.0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,Taare Zameen Par,165,8.5,102697,1.20,42.0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
434,Resident Evil: Afterlife,97,5.9,140900,60.13,37.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
435,Project X,88,6.7,164088,54.72,48.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
436,Hostel: Part II,94,5.5,73152,17.54,46.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:

def standardization(data_frame, columns_lebels=None, is_range_based=True):
    data_frame = data_frame.copy()
    if columns_lebels is None:
        columns_lebels = data_frame.columns

    for label in columns_lebels:
        column = data_frame[label]
        m =  column.mean()
        rng = column.max() - column.min()
        std = column.std()

        if is_range_based:
            data_frame[label] = (column - m) / rng
        else:
            data_frame[label] = (column - m) / std

    return data_frame

In [6]:
columns = set(imdb_data.columns) - {"Title"}
standardization(imdb_data, columns)

Unnamed: 0,Title,Runtime,Rating,Votes,Revenue,Metascore,Year_2006,Year_2007,Year_2008,Year_2009,...,Genre_History,Genre_Music,Genre_Romance,Genre_War,Genre_Animation,Genre_Fantasy,Genre_Musical,Genre_Crime,Genre_Thriller,Genre_Family
0,Prometheus,0.079885,0.010676,0.125262,0.036839,0.055317,-0.093607,-0.100457,-0.109589,-0.10274,...,-0.020548,-0.027397,-0.148402,-0.013699,-0.050228,-0.139269,-0.011416,-0.157534,-0.189498,-0.054795
1,Pirates of the Caribbean: On Stranger Tides,0.203596,-0.031578,0.074555,0.187531,-0.185647,-0.093607,-0.100457,-0.109589,-0.10274,...,-0.020548,-0.027397,-0.148402,-0.013699,-0.050228,0.860731,-0.011416,-0.157534,-0.189498,-0.054795
2,The Dark Knight,0.368545,0.292366,0.854697,0.571836,0.260136,-0.093607,-0.100457,0.890411,-0.10274,...,-0.020548,-0.027397,-0.148402,-0.013699,-0.050228,-0.139269,-0.011416,0.842466,-0.189498,-0.054795
3,The Prestige,0.141741,0.221944,0.363921,-0.059651,0.067365,0.906393,-0.100457,-0.109589,-0.10274,...,-0.020548,-0.027397,-0.148402,-0.013699,-0.050228,-0.139269,-0.011416,-0.157534,-0.189498,-0.054795
4,Pirates of the Caribbean: At World's End,0.543803,0.024760,0.132523,0.277394,-0.125406,-0.093607,0.899543,-0.109589,-0.10274,...,-0.020548,-0.027397,-0.148402,-0.013699,-0.050228,0.860731,-0.011416,-0.157534,-0.189498,-0.054795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,Taare Zameen Par,0.502566,0.221944,-0.088706,-0.127871,-0.221791,-0.093607,0.899543,-0.109589,-0.10274,...,-0.020548,0.972603,-0.148402,-0.013699,-0.050228,-0.139269,-0.011416,-0.157534,-0.189498,0.945205
434,Resident Evil: Afterlife,-0.198465,-0.144254,-0.067370,-0.050381,-0.282032,-0.093607,-0.100457,-0.109589,-0.10274,...,-0.020548,-0.027397,-0.148402,-0.013699,-0.050228,-0.139269,-0.011416,-0.157534,-0.189498,-0.054795
435,Project X,-0.291249,-0.031578,-0.054420,-0.057495,-0.149502,-0.093607,-0.100457,-0.109589,-0.10274,...,-0.020548,-0.027397,-0.148402,-0.013699,-0.050228,-0.139269,-0.011416,-0.157534,-0.189498,-0.054795
436,Hostel: Part II,-0.229393,-0.200592,-0.105206,-0.106384,-0.173599,-0.093607,0.899543,-0.109589,-0.10274,...,-0.020548,-0.027397,-0.148402,-0.013699,-0.050228,-0.139269,-0.011416,-0.157534,-0.189498,-0.054795


In [7]:
import sklearn as sk
import sklearn.cluster

def cluster(data_frame, clusters_number, clustering_columns_labels=None):
    data_frame = data_frame.copy()
    if clustering_columns_labels is None:
        clustering_columns_labels = data_frame.labels

    data_frame_normalized = standardization(data_frame, clustering_columns_labels)
    clustering_data = data_frame_normalized[clustering_columns_labels].to_numpy()
    k_means = sk.cluster.KMeans(n_clusters=clusters_number, init='random', n_init=300, max_iter=10000, algorithm='full', random_state=42)
    result = k_means.fit(clustering_data)
    data_frame['Cluster'] = result.labels_

    return data_frame, result.inertia_

In [8]:
def print_clusters_info(data_frame_clustered, item_id_label=None, clustering_columns_labels=None):
    if clustering_columns_labels is None:
        clustering_columns_labels = data_frame.labels

    clusters_number = data_frame_clustered['Cluster'].max() + 1
    
    for claster_id in range(clusters_number):
        statistics = pd.DataFrame()
        statistics[""] = ["Within cluster mean", "Grand mean", 'Difference', 'Difference, %']
        for column_label in clustering_columns_labels:
            grand_mean = data_frame_clustered[column_label].mean()
            within_cluster_mean = data_frame_clustered[column_label][data_frame_clustered['Cluster'] == claster_id].mean()
            difference = within_cluster_mean - grand_mean
            relative_difference = difference / grand_mean

            statistics[column_label] = [within_cluster_mean, grand_mean, difference, relative_difference]

        print(f'Cluster № {claster_id + 1}')
        print('------------------------------')
        display(statistics)
        if item_id_label is not None:
            print("Items:")
            items_in_claster = data_frame_clustered[item_id_label][data_frame_clustered['Cluster'] == claster_id]
            for item_id in items_in_claster:
                print(item_id)
        print()

In [9]:
columns = ['Rating', 'Votes',  'Revenue', 'Metascore']
n_clusters = 9

imdb_data_clustered, inertia = cluster(imdb_data, clusters_number=n_clusters, clustering_columns_labels=columns)
print(f'Inertia: {inertia}')
print()
print_clusters_info(imdb_data_clustered, item_id_label='Title', clustering_columns_labels=columns)

Inertia: 8.251742216292607

Cluster № 1
------------------------------


Unnamed: 0,Unnamed: 1,Rating,Votes,Revenue,Metascore
0,Within cluster mean,8.44,1315836.0,531.562,77.2
1,Grand mean,6.924201,261530.1,98.444269,60.408676
2,Difference,1.515799,1054306.0,433.117731,16.791324
3,"Difference, %",0.218913,4.0313,4.399624,0.277962


Items:
The Dark Knight
The Avengers
Inception
Avatar
The Dark Knight Rises

Cluster № 2
------------------------------


Unnamed: 0,Unnamed: 1,Rating,Votes,Revenue,Metascore
0,Within cluster mean,7.523944,241381.140845,47.171408,79.985915
1,Grand mean,6.924201,261530.098174,98.444269,60.408676
2,Difference,0.599743,-20148.957328,-51.272861,19.57724
3,"Difference, %",0.086615,-0.077043,-0.520831,0.32408


Items:
Prisoners
Hot Fuzz
Her
Drive
Warrior
Into the Wild
Children of Men
Zodiac
There Will Be Blood
The Cabin in the Woods
La vie d'Adèle
Snowpiercer
Dallas Buyers Club
Lincoln
Rush
Looper
Midnight in Paris
Zombieland
Zero Dark Thirty
The imposible
Coraline
Blue Valentine
Jagten
The Town
American Gangster
The Lives of Others
Moonrise Kingdom
Hairspray
Under the Skin
(500) Days of Summer
Easy A
The Ghost Writer
Enchanted
Little Miss Sunshine
Moneyball
Män som hatar kvinnor
Let Me In
Hellboy II: The Golden Army
The Host
Knocked Up
Source Code
Sweeney Todd: The Demon Barber of Fleet Street
Eastern Promises
Gone Baby Gone
The Hurt Locker
Hunger
Tinker Tailor Soldier Spy
Incendies
Hugo
El secreto de sus ojos
True Grit
The Fighter
Captain Phillips
Rescue Dawn
The World's End
The Spectacular Now
Fantastic Mr. Fox
Inside Man
127 Hours
Jane Eyre
Melancholia
The Princess and the Frog
Flight
Locke
28 Weeks Later
Blue Jasmine
The Tree of Life
50/50
Bridge to Terabithia
Precious
The Descendants

C

Unnamed: 0,Unnamed: 1,Rating,Votes,Revenue,Metascore
0,Within cluster mean,7.48,482035.333333,306.972333,69.5
1,Grand mean,6.924201,261530.098174,98.444269,60.408676
2,Difference,0.555799,220505.23516,208.528064,9.091324
3,"Difference, %",0.080269,0.843135,2.118235,0.150497


Items:
Pirates of the Caribbean: Dead Man's Chest
Harry Potter and the Deathly Hallows: Part 2
The Hunger Games
Frozen
Iron Man
Transformers
Cars
The Hangover
Skyfall
The Hobbit: An Unexpected Journey
Iron Man Three
Man of Steel
Harry Potter and the Order of the Phoenix
Star Trek Into Darkness
The Amazing Spider-Man
Despicable Me
Monsters University
Harry Potter and the Deathly Hallows: Part 1
Mission: Impossible - Ghost Protocol
Iron Man 2
Harry Potter and the Half-Blood Prince
I Am Legend
The Hobbit: The Desolation of Smaug
The Hunger Games: Catching Fire
Brave
Toy Story 3
Indiana Jones and the Kingdom of the Crystal Skull
How to Train Your Dragon
Despicable Me 2
Kung Fu Panda

Cluster № 4
------------------------------


Unnamed: 0,Unnamed: 1,Rating,Votes,Revenue,Metascore
0,Within cluster mean,5.514583,119257.479167,59.459583,33.229167
1,Grand mean,6.924201,261530.098174,98.444269,60.408676
2,Difference,-1.409618,-142272.619007,-38.984686,-27.179509
3,"Difference, %",-0.203578,-0.544001,-0.396008,-0.449927


Items:
Aliens vs Predator - Requiem
The Host
Battleship
Sucker Punch
The Three Musketeers
Jennifer's Body
Grown Ups 2
Couples Retreat
The Babysitters
Pandorum
I Am Number Four
Just Go with It
Safe Haven
Movie 43
The Happening
The Green Inferno
Clash of the Titans
What to Expect When You're Expecting
Ghost Rider
Adoration
Friday the 13th
Green Lantern
The Mortal Instruments: City of Bones
Jumper
Resident Evil: Retribution
Grown Ups
The Mummy: Tomb of the Dragon Emperor
Percy Jackson: Sea of Monsters
This Means War
The Purge
Lady in the Water
G.I. Joe: The Rise of Cobra
Disaster Movie
Fool's Gold
Dragonball Evolution
Red Dawn
G.I. Joe: Retaliation
Wrath of the Titans
The Ugly Truth
Bride Wars
The Human Centipede (First Sequence)
What's Your Number?
After Earth
Sex and the City 2
Legion
I Spit on Your Grave
Your Highness
Resident Evil: Afterlife

Cluster № 5
------------------------------


Unnamed: 0,Unnamed: 1,Rating,Votes,Revenue,Metascore
0,Within cluster mean,8.003846,624641.153846,142.896923,83.576923
1,Grand mean,6.924201,261530.098174,98.444269,60.408676
2,Difference,1.079645,363111.055673,44.452654,23.168247
3,"Difference, %",0.155923,1.38841,0.451551,0.383525


Items:
The Prestige
Inglourious Basterds
The Wolf of Wall Street
The Departed
12 Years a Slave
No Country for Old Men
Shutter Island
Star Trek
Django Unchained
Superbad
Casino Royale
Pan's Labyrinth
Black Swan
The Social Network
Silver Linings Playbook
American Hustle
Slumdog Millionaire
The Bourne Ultimatum
District 9
Argo
Ratatouille
Up
Gravity
Juno
The King's Speech
Life of Pi

Cluster № 6
------------------------------


Unnamed: 0,Unnamed: 1,Rating,Votes,Revenue,Metascore
0,Within cluster mean,7.183333,350308.133333,137.657667,63.366667
1,Grand mean,6.924201,261530.098174,98.444269,60.408676
2,Difference,0.259132,88778.03516,39.213397,2.957991
3,"Difference, %",0.037424,0.339456,0.398331,0.048966


Items:
Prometheus
The Help
Thor
300
Furious 6
The Great Gatsby
Watchmen
Pacific Rim
Crazy, Stupid, Love.
Tropic Thunder
Captain America: The First Avenger
X: First Class
Bridesmaids
Kick-Ass
The Girl with the Dragon Tattoo
Les Misérables
The Intouchables
The Conjuring
Rise of the Planet of the Apes
21 Jump Street
Oblivion
Now You See Me
Sherlock Holmes
The Devil Wears Prada
Lone Survivor
Fast Five
This Is the End
Wanted
The Wolverine
The Incredible Hulk
Tangled
Taken
The Curious Case of Benjamin Button
World War Z
Salt
The Other Guys
Cloverfield
Men in Black 3
Super 8
The Bourne Legacy
Limitless
Wreck-It Ralph
Megamind
Horrible Bosses
The Pursuit of Happyness
Public Enemies
Pineapple Express
The Heat
Blood Diamond
Live Free or Die Hard
Ted
RED
Happy Feet
Quantum of Solace
Mission: Impossible III
Rio
Real Steel
Ocean's Thirteen
Superman Returns
Talladega Nights: The Ballad of Ricky Bobby

Cluster № 7
------------------------------


Unnamed: 0,Unnamed: 1,Rating,Votes,Revenue,Metascore
0,Within cluster mean,6.337037,292733.074074,232.156667,50.037037
1,Grand mean,6.924201,261530.098174,98.444269,60.408676
2,Difference,-0.587164,31202.975901,133.712397,-10.371639
3,"Difference, %",-0.084799,0.119309,1.358255,-0.171691


Items:
Pirates of the Caribbean: On Stranger Tides
Pirates of the Caribbean: At World's End
Twilight
Thor: The Dark World
We're the Millers
X-Men Origins: Wolverine
The Blind Side
Fast & Furious
Spider-Man 3
The Twilight Saga: Breaking Dawn - Part 2
Oz the Great and Powerful
The Proposal
Sherlock Holmes: A Game of Shadows
2012
Alice in Wonderland
Snow White and the Huntsman
Transformers: Dark of the Moon
Tron
X-Men: The Last Stand
Transformers: Revenge of the Fallen
The Karate Kid
The Da Vinci Code
Cars 2
Hancock
Sex and the City
The Twilight Saga: Breaking Dawn - Part 1
The Twilight Saga: Eclipse

Cluster № 8
------------------------------


Unnamed: 0,Unnamed: 1,Rating,Votes,Revenue,Metascore
0,Within cluster mean,6.471,138515.92,48.4158,48.8
1,Grand mean,6.924201,261530.098174,98.444269,60.408676
2,Difference,-0.453201,-123014.178174,-50.028469,-11.608676
3,"Difference, %",-0.065452,-0.470363,-0.508191,-0.192169


Items:
Mamma Mia!
Diary of a Wimpy Kid
Spring Breakers
Predators
Percy Jackson & the Olympians: The Lightning Thief
The House Bunny
She's Out of My League
The Fast and the Furious: Tokyo Drift
Step Up
In Time
White House Down
Step Brothers
Robin Hood
John Carter
Kick-Ass 2
Carrie
Olympus Has Fallen
The Book of Eli
Law Abiding Citizen
Shooter
Half of a Yellow Sun
The Counselor
A Good Year
Jack Reacher
The Break-Up
The Expendables
Evil Dead
Sinister
All Good Things
Immortals
She's the Man
Never Back Down
Gangster Squad
Beautiful Creatures
The Lovely Bones
Underworld Awakening
Antichrist
Total Recall
Love & Other Drugs
American Reunion
Insidious
The First Time
Pain & Gain
Escape Plan
Seven Pounds
Australia
Faster
Closed Circuit
The Heartbreak Kid
The Strangers
Dark Shadows
The A-Team
The Internship
August Rush
Eagle Eye
The Fountain
Mr. Brooks
The Family
Zack and Miri Make a Porno
Man on a Ledge
No Strings Attached
Beowulf
Prince of Persia: The Sands of Time
Mama
Orphan
To Rome with Love


Unnamed: 0,Unnamed: 1,Rating,Votes,Revenue,Metascore
0,Within cluster mean,7.183099,163844.760563,27.659014,63.492958
1,Grand mean,6.924201,261530.098174,98.444269,60.408676
2,Difference,0.258898,-97685.33761,-70.785255,3.084282
3,"Difference, %",0.03739,-0.373515,-0.719039,0.051057


Items:
The Place Beyond the Pines
Scott Pilgrim vs. the World
Apocalypto
The Perks of Being a Wallflower
Kynodontas
Nymphomaniac: Vol. I
Cloud Atlas
Pitch Perfect
RocknRolla
Stardust
About Time
Magic Mike
Shame
Hanna
3 Idiots
Dredd
Revolutionary Road
Enemy
Marie Antoinette
Forgetting Sarah Marshall
Stake Land
Savages
The Boy in the Striped Pyjamas
Trust
Idiocracy
Sunshine
Moon
The Assassination of Jesse James by the Coward Robert Ford
Don Jon
The Secret Life of Walter Mitty
Lawless
Anna Karenina
The Mist
We Need to Talk About Kevin
Nymphomaniac: Vol. II
Friends with Benefits
The Illusionist
Paul
Body of Lies
Fracture
Oculus
In Bruges
Unknown
Seven Psychopaths
1408
The Imaginarium of Doctor Parnassus
Rocky Balboa
Unstoppable
42
Begin Again
Out of the Furnace
Vicky Cristina Barcelona
Freedom Writers
Changeling
Warm Bodies
Saving Mr. Banks
Babel
The Fall
What If
Slither
Contagion
Coherence
Notorious
Perfume: The Story of a Murderer
Centurion
The Kings of Summer
End of Watch
Lucky Number S