In [43]:
import pandas as pd
import numpy as np

movies = pd.read_table('data/movies/movies.dat', sep='::', engine='python', 
                       header=None, names=["movie_id", "title", "genres"], index_col=0)

movies = movies.iloc[:100]

all_genres = list({genre for row in movies['genres'] for genre in row.split('|') })

all_genres


['Sci-Fi',
 'Comedy',
 'Drama',
 'Crime',
 'Fantasy',
 'Romance',
 'Thriller',
 'Action',
 'Adventure',
 'Documentary',
 'Horror',
 'Musical',
 'Mystery',
 'War',
 "Children's",
 'Animation']

### Get Dummies
- converts array of values made of k-distinct lables into matrix
- matrix will have k columns of distinct lables
- if ith value of array is cat4 then ith row of the matrix will have 1 value at cat4 column else 0

In [58]:
rand_genres = np.random.choice(all_genres, (10))
rand_movies = pd.DataFrame(rand_genres, columns=['genre'])

display(rand_movies)
dummies = pd.get_dummies(rand_movies.genre)
dummies

Unnamed: 0,genre
0,Drama
1,Mystery
2,Action
3,Romance
4,Comedy
5,Adventure
6,War
7,Fantasy
8,Mystery
9,Fantasy


Unnamed: 0,Action,Adventure,Comedy,Drama,Fantasy,Mystery,Romance,War
0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0
5,0,1,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0
8,0,0,0,0,0,1,0,0
9,0,0,0,0,1,0,0,0


In [59]:
dummies.columns.get_indexer(['Drama', 'Mystery'])

array([3, 5])

### If Row Has Multiple Lables
- like in real movie data there will be multiple genres for a movie

In [66]:
## zeros matrix with rows equal to movies lenght and columns equal to all_genres
zero_matrix = np.zeros((len(movies), len(all_genres)))
dummies = pd.DataFrame(zero_matrix, columns=all_genres)

for i, genres in enumerate(movies.genres.head()):
    indices = dummies.columns.get_indexer(genres.split('|')) ## get columns indices in dummies for the genres
    dummies.iloc[i, indices] = 1 ## set 1 for columns for coresponding indices

## merge dummies and movies
movies.join(dummies) ## joins DataFrame columns matching value of index column

    

Unnamed: 0_level_0,title,genres,Sci-Fi,Comedy,Drama,Crime,Fantasy,Romance,Thriller,Action,Adventure,Documentary,Horror,Musical,Mystery,War,Children's,Animation
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,Toy Story (1995),Animation|Children's|Comedy,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Grumpier Old Men (1995),Comedy|Romance,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Waiting to Exhale (1995),Comedy|Drama,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Heat (1995),Action|Crime|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Sabrina (1995),Comedy|Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Tom and Huck (1995),Adventure|Children's,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Sudden Death (1995),Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,GoldenEye (1995),Action|Adventure|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Dummies Often Used With Cut
- Cut categories numerical values in range
- Dummies can re-structure it like in above categories