## Sampling data (similar to what zecai did, should be replaced by zecai's samples)

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [7]:
# import all 25k data (both files are indexed by their first column)
df_feature = pd.read_csv("feature_final.csv", index_col = 0)
df_genre = pd.read_csv("genre_final.csv", index_col = 0)

# sampling 5000 entries (fixed random_state so the sample is reproducible)
feature = df_feature.sample(n=5000, random_state=0)
# pick the genre rows of the sampled movies by index label.
# `.ix` has been removed from pandas; `.loc` is the label-based replacement
# (it raises KeyError if a sampled id is missing from the genre table).
genre = df_genre.loc[feature.index]

# separate into train and test sets: first 2500 sampled rows vs. the rest, by position.
# `.copy()` makes each split an independent frame rather than a slice of `feature`/`genre`.
train_feature = feature.iloc[:2500].copy()
train_genre = genre.iloc[:2500].copy()

test_feature = feature.iloc[2500:].copy()
test_genre = genre.iloc[2500:].copy()

In [18]:
# Sanity check: shape of the token-count matrix built from the 'cast' column.
# NOTE: string_to_vector is defined in a later cell (In [24]); run that cell first.
string_to_vector(train_feature, 'cast').shape

(2500, 50000)

In [23]:
# Inspect the raw 'animation department' column of the training features.
train_feature['animation department']

imdb_ids
1715743                                                  NaN
36049                                                    NaN
30252                                                    NaN
310907                                                   NaN
1074191                                                  NaN
381911                                                   NaN
117690                                                   NaN
1642266                                                  NaN
1476428                                                  NaN
189981                                                   NaN
48001                                                    NaN
38471                                                    NaN
1479175    ['0405038', '0454014', '0460460', '1532547', '...
265086                                                   NaN
75610                                                    NaN
963915                                                   NaN
103768         

---------------------
## Danqing's part

The functions below assume that the following inputs are already available (either generated by zecai or produced by the sampling code above; the two should be equivalent):

- train_feature: df with 2500 train observations, all feature columns, indexed with imdb_ids
- train_genre: df with 2500 train observations, all genre columns, indexed with imdb_ids
- test_feature: df with 2500 test observations, all feature columns, indexed with imdb_ids 
- test_genre: df with 2500 test observations, all genre columns, indexed with imdb_ids

Other parameters used in the following functions:
- val_name #feature of interest, eg. director
- val_n #the top n values in each genre are taken
- pca_n #number of pca components to retain

## Common function by zecai, used in all functions below

In [24]:
### Function to convert a column of string lists, e.g. "[u'Action', u'Adventure', u'Fantasy']",
### into dummy (token count) coding
## input:
           # data = original data frame,
           # val_name = name of the variable
## output: a data frame

def string_to_vector(data, val_name):
    """Turn one text column of `data` into a token-count DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column `val_name`. It is NOT modified.
    val_name : str
        Name of the column to vectorize.

    Returns
    -------
    pandas.DataFrame
        One row per row of `data` (same index) and one column per token
        found by CountVectorizer (at most 50000 columns); each cell is the
        number of times the token occurs in that row. Missing values are
        represented by the token 'nan'.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    # Replace missing values with the string 'nan' on a copy of the column.
    # The previous chained assignment (data[val_name][mask] = 'nan') wrote into
    # the caller's frame and silently does nothing under pandas copy-on-write.
    text_column = data[val_name].fillna('nan')

    vectorizer = CountVectorizer(analyzer = "word",
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = None,
                                 max_features = 50000)

    val_data = vectorizer.fit_transform(text_column)

    # get_feature_names() was removed in newer scikit-learn releases;
    # prefer get_feature_names_out() and fall back for old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    df_val = pd.DataFrame(val_data.toarray(),
                          index = data.index,
                          columns = feature_names)

    return df_val

## 1. FUNCTION for Director, Writer

In [25]:
### Function considers a particular feature of interest (e.g. director, writer)
### and picks out the top val_n most frequent values in each genre

def top_features(train_feature = train_feature,
                train_genre = train_genre,
                test_feature = test_feature,
                test_genre = test_genre,
                val_name = 'director',
                val_n = 1):
    """Build indicator/count columns for the most frequent values of a feature per genre.

    Parameters
    ----------
    train_feature, test_feature : pandas.DataFrame
        Feature tables (all feature columns); the notebook indexes them by imdb_ids.
    train_genre : pandas.DataFrame
        Genre table for the training rows; a movie belongs to a genre when
        that genre's column equals 1.
    test_genre : pandas.DataFrame
        Accepted for signature symmetry; not used.
    val_name : str
        Feature column of interest, e.g. 'director'.
    val_n : int
        Number of top values to take per genre, e.g. 1.

    Returns
    -------
    (train_val, test_val) : tuple of pandas.DataFrame
        Same rows/index as the inputs, with columns named val_name + '0',
        val_name + '1', ... (up to val_n columns per genre, in genre order).
        A value that tops several genres appears once per genre. Values
        that never occur in the test set give all-zero test columns.
    """
    # convert feature of interest into dummy variables in train set and test set
    feature_val_train = string_to_vector(train_feature, val_name)
    feature_val_test = string_to_vector(test_feature, val_name)

    # Candidate values are all tokens seen in training except the 'nan'
    # placeholder that string_to_vector uses for missing entries. Selecting it
    # by name avoids assuming that it happens to be the last column.
    candidate_cols = [col for col in feature_val_train.columns if col != 'nan']
    candidates = feature_val_train[candidate_cols]

    # generate list of top values in each genre of the *training* genre table
    # (the parameter, not a notebook-level global)
    val_list = []
    for genre_name in train_genre.columns:
        # align the genre flags to the feature rows; unmatched rows count as "not in genre"
        in_genre = (train_genre[genre_name].reindex(candidates.index) == 1).to_numpy()
        counts_in_genre = candidates.loc[in_genre].sum(axis = 0)
        # stable sort so ties are broken deterministically (by column order)
        ranked = counts_in_genre.sort_values(ascending = False, kind = "mergesort")
        # slicing (instead of index[j]) tolerates val_n > number of candidates
        val_list.extend(ranked.index[:val_n])

    # output dataframes of movies with new columns; reindex tolerates repeated
    # labels and labels absent from the test set (those columns are filled with 0)
    train_val = feature_val_train.reindex(columns = val_list, fill_value = 0)
    test_val = feature_val_test.reindex(columns = val_list, fill_value = 0)

    # rename columns as director0, director1, etc
    col_names = [val_name + str(i) for i in range(train_val.shape[1])]
    train_val.columns = col_names
    test_val.columns = col_names

    return(train_val, test_val)

In [26]:
# Top director (one per genre) as new columns for the train and test sets.
train_val, test_val = top_features(
    train_feature,
    train_genre,
    test_feature,
    test_genre,
    val_name='director',
    val_n=1,
)

## 2. FUNCTION for Cast, production companies

In [27]:
### Function considers a particular feature of interest (e.g. cast, production companies),
### keeps the top val_n values in each genre, and compresses them with PCA

def top_features_pca(train_feature = train_feature,
                     train_genre = train_genre,
                     test_feature = test_feature,
                     test_genre = test_genre,
                     val_name = 'cast',
                     val_n = 5,
                     pca_n = 5):
    """Select top values per genre with top_features, then project onto principal components.

    Input:
      - train_feature / test_feature: feature tables for the train and test rows
      - train_genre / test_genre: genre tables for the train and test rows
      - val_name: feature of interest, e.g. 'cast'
      - val_n: number of top values to take per genre, e.g. 5
      - pca_n: number of PCA components to retain, e.g. 5
    Output:
      - (train_pca, test_pca): dataframes with the same index as the frames
        returned by top_features and columns val_name + '_PC1', '_PC2', ...
        The PCA is fitted on the train rows only and applied to both sets.
    """
    # Step 1: top values per genre, via top_features
    selected_train, selected_test = top_features(train_feature = train_feature,
                                                 train_genre = train_genre,
                                                 test_feature = test_feature,
                                                 test_genre = test_genre,
                                                 val_name = val_name,
                                                 val_n = val_n)

    # pca.transform cannot handle missing values, so zero-fill the test set
    selected_test = selected_test.fillna(value=0)

    # Step 2: fit PCA on train, keep the first pca_n components
    from sklearn.decomposition import PCA

    reducer = PCA(n_components = pca_n, svd_solver = "full").fit(selected_train)
    train_pca = pd.DataFrame(reducer.transform(selected_train), index = selected_train.index)
    test_pca = pd.DataFrame(reducer.transform(selected_test), index = selected_test.index)

    # Step 3: label the components cast_PC1, cast_PC2, ...
    pc_labels = [val_name + "_PC" + str(k + 1) for k in range(pca_n)]
    train_pca.columns = pc_labels
    test_pca.columns = pc_labels

    return (train_pca, test_pca)

## 3. FUNCTION for Animation department, original music

In [28]:
### Function considers a particular feature of interest (e.g. animation department, original music)
### and counts the number of member occurrences in each movie

def feature_to_count(train_feature = train_feature,
                test_feature = test_feature,
                val_name = 'animation department'):
    """Count, per movie, how many tokens (members) the feature column contains.

    Parameters
    ----------
    train_feature, test_feature : pandas.DataFrame
        Feature tables containing the column `val_name`.
    val_name : str
        Feature of interest, e.g. 'animation department'.

    Returns
    -------
    (train_count, test_count) : tuple of pandas.DataFrame
        Each has the same index as its input and a single column named
        val_name + ' count'. Rows whose value is missing get a count of 0.
    """
    count_col = val_name + ' count'

    def _count_members(feature_df):
        # Row-wise total of token counts, ignoring the missing-value placeholder.
        tokens = string_to_vector(feature_df, val_name)
        # Drop the 'nan' placeholder by name rather than assuming it is the
        # last column; errors='ignore' covers columns without missing values.
        tokens = tokens.drop(columns = 'nan', errors = 'ignore')
        # to_frame(name) replaces DataFrame(..., columns={...}); pandas 2.x
        # rejects a set passed as `columns`.
        return tokens.sum(axis = 1).to_frame(name = count_col)

    train_count = _count_members(train_feature)
    test_count = _count_members(test_feature)

    return(train_count, test_count)


## 4. Combining everything into one dataframe

In [29]:
### Combines information from director, writer, cast, production companies,
### animation department, original music into one train and one test dataframe

def combine_all_danqing():
    """Assemble all engineered feature groups side by side.

    Input: none (reads the notebook-level train_feature, train_genre,
    test_feature and test_genre).
    Output:
      - train_combined: train dataframe (the original notes report 2500 x 75)
      - test_combined: test dataframe (the original notes report 2500 x 75)
    Column groups, in order: director, writer, cast PCs, production companies
    PCs, animation department count, original music count.
    """
    genre_inputs = dict(train_feature = train_feature,
                        train_genre = train_genre,
                        test_feature = test_feature,
                        test_genre = test_genre)
    count_inputs = dict(train_feature = train_feature,
                        test_feature = test_feature)

    # Each entry is a (train, test) pair; the list is evaluated top to bottom.
    feature_groups = [
        top_features(val_name = 'director', val_n = 1, **genre_inputs),
        top_features(val_name = 'writer', val_n = 1, **genre_inputs),
        top_features_pca(val_name = 'cast', val_n = 5, pca_n = 10, **genre_inputs),
        top_features_pca(val_name = 'production companies', val_n = 5, pca_n = 5, **genre_inputs),
        feature_to_count(val_name = 'animation department', **count_inputs),
        feature_to_count(val_name = 'original music', **count_inputs),
    ]

    train_parts = [train_part for train_part, _ in feature_groups]
    test_parts = [test_part for _, test_part in feature_groups]

    # Final output: all groups concatenated column-wise
    train_combined = pd.concat(train_parts, axis = 1)
    test_combined = pd.concat(test_parts, axis = 1)

    return(train_combined, test_combined)

In [30]:
# Build the combined train/test feature tables from all six feature groups.
train_combined, test_combined = combine_all_danqing()

In [33]:
# Inspect the per-movie 'animation department count' column of the combined training table.
train_combined['animation department count']

imdb_ids
1715743      0
36049        0
30252        0
310907       0
1074191      0
381911       0
117690       0
1642266      0
1476428      0
189981       0
48001        0
38471        0
1479175      5
265086       0
75610        0
963915       0
103768       0
80438        0
1029172      0
847212       0
1239310      0
61664        0
438129       0
1258935      0
67633        0
285131       0
177242       0
94245        0
796302       0
47582        0
          ... 
101371       0
122474       0
319297       0
309912       0
1054122      0
112342       1
56049        0
1216640      0
76878        0
101489       0
111275       0
69954        0
160644       0
99878      177
1648062      0
455915       0
159241       0
106246       0
44941        0
1241325      0
985694       0
326905       1
105601       0
72136        0
24710        0
61858        0
1430641      0
418676       0
34398        0
49414        0
Name: animation department count, dtype: int64

In [602]:
# Preview the first rows of the combined test feature table.
test_combined.head()

Unnamed: 0_level_0,director0,director1,director2,director3,director4,director5,director6,director7,director8,director9,...,cast_PC8,cast_PC9,cast_PC10,production companies_PC1,production companies_PC2,production companies_PC3,production companies_PC4,production companies_PC5,animation department count,original music count
imdb_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
106961,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.002106,-0.003081,-0.006541,-0.081953,-0.060032,-0.074165,-0.026083,0.000185,0,0
113810,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.002106,-0.003081,-0.006541,-0.081953,-0.060032,-0.074165,-0.026083,0.000185,0,1
264476,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.002106,-0.003081,-0.006541,-0.081953,-0.060032,-0.074165,-0.026083,0.000185,0,1
374312,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.002106,-0.003081,-0.006541,-0.081953,-0.060032,-0.074165,-0.026083,0.000185,0,0
78239,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,...,0.002106,-0.003081,-0.006541,-0.081953,-0.060032,-0.074165,-0.026083,0.000185,0,0
