# 7.1  Handling Missing Data

In [1]:
# Making a Pandas Series
import pandas as pd
import numpy as np

In [2]:
# NOTE(review): the last element is the function object `np.nanmax`, not a missing value,
# so it is stored as an ordinary object and only index 3 (np.nan) counts as null.
# Presumably `np.nan` was intended here — confirm before changing.
seriesName = pd.Series(['ad','bk','cz',np.nan,'kc','ut',np.nanmax])

In [3]:
seriesName

0                                     ad
1                                     bk
2                                     cz
3                                    NaN
4                                     kc
5                                     ut
6    <function nanmax at 0x7fdba0263e18>
dtype: object

In [4]:
# Check if null is present

In [5]:
seriesName.isnull()

0    False
1    False
2    False
3     True
4    False
5    False
6    False
dtype: bool

In [6]:
seriesName.isna()

0    False
1    False
2    False
3     True
4    False
5    False
6    False
dtype: bool

In [7]:
# Element-wise check that isnull() and isna() flag exactly the same positions for this Series.
seriesName.isnull() == seriesName.isna()

0    True
1    True
2    True
3    True
4    True
5    True
6    True
dtype: bool

In [8]:
# NOTE(review): `seriesName.unique` is not called (no parentheses), so each boolean is
# compared against the bound method object and every comparison comes out False.
# The intended comparison is unclear — confirm or remove this cell.
seriesName.isnull() == seriesName.unique

0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [9]:
seriesName.isnull()

0    False
1    False
2    False
3     True
4    False
5    False
6    False
dtype: bool

### Methods for handling missing data

In [10]:
# dropna will drop all the existing null values

In [11]:
seriesName.dropna()

0                                     ad
1                                     bk
2                                     cz
4                                     kc
5                                     ut
6    <function nanmax at 0x7fdba0263e18>
dtype: object

In [12]:
# Creating a dataframe


In [13]:
# Raw columns for the demo DataFrame: seven states, their capitals and a population figure.
# Manipur's capital is an empty string and two of the population entries are NaN.
data = {
    'state': [
        'ArunachalPradesh', 'Meghalaya', 'Assam', 'Tripura',
        'Nagaland', 'Manipur', 'Mizoram',
    ],
    'capitals': [
        'Itanagar', 'Shillong', 'Guwahati', 'Agartala',
        'Kohima', '', 'Aizwal',
    ],
    'population': [71, 89, 64, np.nan, 56, np.nan, 9],
}

In [14]:
pd.DataFrame(data=data)

Unnamed: 0,state,capitals,population
0,ArunachalPradesh,Itanagar,71.0
1,Meghalaya,Shillong,89.0
2,Assam,Guwahati,64.0
3,Tripura,Agartala,
4,Nagaland,Kohima,56.0
5,Manipur,,
6,Mizoram,Aizwal,9.0


In [15]:
# The requested column names ('State', 'Captials') do not match the dict keys
# ('state', 'capitals'), so no data is picked up and the resulting frame has no rows.
pd.DataFrame(data=data,columns=['State','Captials'])

Unnamed: 0,State,Captials


In [16]:
pd.DataFrame(data=data,columns=['state','capitals'])

Unnamed: 0,state,capitals
0,ArunachalPradesh,Itanagar
1,Meghalaya,Shillong
2,Assam,Guwahati
3,Tripura,Agartala
4,Nagaland,Kohima
5,Manipur,
6,Mizoram,Aizwal


In [17]:
pd.DataFrame(data=data,columns=['state','capitals','population'])

Unnamed: 0,state,capitals,population
0,ArunachalPradesh,Itanagar,71.0
1,Meghalaya,Shillong,89.0
2,Assam,Guwahati,64.0
3,Tripura,Agartala,
4,Nagaland,Kohima,56.0
5,Manipur,,
6,Mizoram,Aizwal,9.0


In [18]:
# Build the working DataFrame from the column dict; the cells below reuse it.
dframe = pd.DataFrame(data)

In [19]:
dframe

Unnamed: 0,state,capitals,population
0,ArunachalPradesh,Itanagar,71.0
1,Meghalaya,Shillong,89.0
2,Assam,Guwahati,64.0
3,Tripura,Agartala,
4,Nagaland,Kohima,56.0
5,Manipur,,
6,Mizoram,Aizwal,9.0


In [20]:
# Boolean mask of missing entries (True where the value is NaN).
# The empty-string capital in row 5 is not reported as missing.
dframe.isna()

Unnamed: 0,state,capitals,population
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,True
4,False,False,False
5,False,False,True
6,False,False,False


In [21]:
dframe.isnull()

Unnamed: 0,state,capitals,population
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,True
4,False,False,False
5,False,False,True
6,False,False,False


In [22]:
# NOTE(review): this overwrites the whole 'population' column (including its two NaN entries)
# with the integers 9..15, so the frame no longer contains any NaN and the dropna()/fillna()
# calls in the following cells leave it unchanged.
dframe['population'] = np.arange(9,16)

In [23]:
dframe

Unnamed: 0,state,capitals,population
0,ArunachalPradesh,Itanagar,9
1,Meghalaya,Shillong,10
2,Assam,Guwahati,11
3,Tripura,Agartala,12
4,Nagaland,Kohima,13
5,Manipur,,14
6,Mizoram,Aizwal,15


In [24]:
# NOTE(review): the flag is True where 'capitals' is the empty string, i.e. where the capital
# is missing — the opposite of what the column name suggests. 'HasCaptial' also looks like a
# typo for 'HasCapital'. Confirm the intent before relying on this column.
dframe['HasCaptial']=(dframe['capitals']=='')

In [25]:
dframe

Unnamed: 0,state,capitals,population,HasCaptial
0,ArunachalPradesh,Itanagar,9,False
1,Meghalaya,Shillong,10,False
2,Assam,Guwahati,11,False
3,Tripura,Agartala,12,False
4,Nagaland,Kohima,13,False
5,Manipur,,14,True
6,Mizoram,Aizwal,15,False


In [26]:
dframe.dropna()

Unnamed: 0,state,capitals,population,HasCaptial
0,ArunachalPradesh,Itanagar,9,False
1,Meghalaya,Shillong,10,False
2,Assam,Guwahati,11,False
3,Tripura,Agartala,12,False
4,Nagaland,Kohima,13,False
5,Manipur,,14,True
6,Mizoram,Aizwal,15,False


### Reset the index after dropna, because dropping rows leaves gaps in the index

In [27]:
dframeAfterDrop = dframe.dropna()

In [28]:
# reset_index() renumbers the rows from 0 and keeps the previous index as a new 'index' column.
# The result is only displayed here; dframeAfterDrop itself is not modified.
dframeAfterDrop.reset_index()

Unnamed: 0,index,state,capitals,population,HasCaptial
0,0,ArunachalPradesh,Itanagar,9,False
1,1,Meghalaya,Shillong,10,False
2,2,Assam,Guwahati,11,False
3,3,Tripura,Agartala,12,False
4,4,Nagaland,Kohima,13,False
5,5,Manipur,,14,True
6,6,Mizoram,Aizwal,15,False


In [29]:
# how='all' drops a row only when every value in it is missing.
# Note: this result is replaced by the assignment in the next cell before it is displayed.
dframeAfterDrop = dframe.dropna(how='all')

In [30]:
# thresh=2 keeps only the rows that have at least two non-missing values.
dframeAfterDrop = dframe.dropna(thresh=2)

In [31]:
dframeAfterDrop

Unnamed: 0,state,capitals,population,HasCaptial
0,ArunachalPradesh,Itanagar,9,False
1,Meghalaya,Shillong,10,False
2,Assam,Guwahati,11,False
3,Tripura,Agartala,12,False
4,Nagaland,Kohima,13,False
5,Manipur,,14,True
6,Mizoram,Aizwal,15,False


## Fill Missing values

Filling in missing data is often a better option than removing it.

In [32]:
dframe

Unnamed: 0,state,capitals,population,HasCaptial
0,ArunachalPradesh,Itanagar,9,False
1,Meghalaya,Shillong,10,False
2,Assam,Guwahati,11,False
3,Tripura,Agartala,12,False
4,Nagaland,Kohima,13,False
5,Manipur,,14,True
6,Mizoram,Aizwal,15,False


In [33]:
dframe.fillna({'population':5})

Unnamed: 0,state,capitals,population,HasCaptial
0,ArunachalPradesh,Itanagar,9,False
1,Meghalaya,Shillong,10,False
2,Assam,Guwahati,11,False
3,Tripura,Agartala,12,False
4,Nagaland,Kohima,13,False
5,Manipur,,14,True
6,Mizoram,Aizwal,15,False


In [34]:
# Fill every missing value with the constant 5 and keep the result in a new frame.
# (Do not assign the result of `dframe.fillna(0, inplace=True)`: with inplace=True the call returns None.)
dframeAfterDrop = dframe.fillna(5)

In [35]:
dframeAfterDrop

Unnamed: 0,state,capitals,population,HasCaptial
0,ArunachalPradesh,Itanagar,9,False
1,Meghalaya,Shillong,10,False
2,Assam,Guwahati,11,False
3,Tripura,Agartala,12,False
4,Nagaland,Kohima,13,False
5,Manipur,,14,True
6,Mizoram,Aizwal,15,False


In [36]:
# Forward fill: each missing value is replaced by the last valid value above it in the same column.
# DataFrame.ffill() is used because the `method=` keyword of fillna() is deprecated in recent pandas.
dframeAfterDrop = dframe.ffill()

In [37]:
dframeAfterDrop

Unnamed: 0,state,capitals,population,HasCaptial
0,ArunachalPradesh,Itanagar,9,False
1,Meghalaya,Shillong,10,False
2,Assam,Guwahati,11,False
3,Tripura,Agartala,12,False
4,Nagaland,Kohima,13,False
5,Manipur,,14,True
6,Mizoram,Aizwal,15,False


## Mean values are inserted in place of missing values

In [38]:
# Replace missing populations with the mean of the 'population' column.
# The fill value is keyed by column so that the population mean is not also written into
# missing entries of unrelated columns (a bare scalar would be applied to every column).
dframeAfterDrop = dframe.fillna({'population': dframe['population'].mean()})

In [39]:
dframeAfterDrop

Unnamed: 0,state,capitals,population,HasCaptial
0,ArunachalPradesh,Itanagar,9,False
1,Meghalaya,Shillong,10,False
2,Assam,Guwahati,11,False
3,Tripura,Agartala,12,False
4,Nagaland,Kohima,13,False
5,Manipur,,14,True
6,Mizoram,Aizwal,15,False


# 7.2 Data Transformation

In [40]:
# Two-column demo frame for the duplicate-handling examples;
# the last row ('two', 4) repeats the row directly above it.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

In [41]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


### Removing Duplicates

In [42]:
#Check if row is duplicated

data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [43]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [44]:
# Drop rows whose value in a particular column ('k1') duplicates an earlier row

data.drop_duplicates('k1')

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [45]:
data.drop_duplicates('k2')

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [46]:
data.drop_duplicates(['k1','k2'])

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [47]:
data.drop_duplicates(['k1','k2'], keep='last').reset_index()

Unnamed: 0,index,k1,k2
0,0,one,1
1,1,two,1
2,2,one,2
3,3,two,3
4,4,one,3
5,6,two,4


### Transforming Data Using a Function or Mapping

In [48]:
# Build the same food table three times, varying only how the 'ounces' column is generated.
# The list of food names is shared; `data` ends up holding the last (randn) version.
foods = [
    'bacon', 'pulled pork', 'bacon',
    'Pastrami', 'corned beef', 'Bacon',
    'pastrami', 'honey ham', 'nova lox',
]

# 1) consecutive integers 6..14
data = pd.DataFrame({'food': foods, 'ounces': np.arange(6, 15)})
print(data)

# 2) np.random.randint without a size returns a single integer, which is repeated on every row
data = pd.DataFrame({'food': foods, 'ounces': np.random.randint(2, 6)})
print(data)

# 3) nine independent draws from the standard normal distribution
data = pd.DataFrame({'food': foods, 'ounces': np.random.randn(9)})
print(data)


          food  ounces
0        bacon       6
1  pulled pork       7
2        bacon       8
3     Pastrami       9
4  corned beef      10
5        Bacon      11
6     pastrami      12
7    honey ham      13
8     nova lox      14
          food  ounces
0        bacon       2
1  pulled pork       2
2        bacon       2
3     Pastrami       2
4  corned beef       2
5        Bacon       2
6     pastrami       2
7    honey ham       2
8     nova lox       2
          food    ounces
0        bacon  0.544803
1  pulled pork  1.842590
2        bacon -0.900193
3     Pastrami  1.364952
4  corned beef -0.077335
5        Bacon  1.454325
6     pastrami -1.814206
7    honey ham  1.736018
8     nova lox  1.337869


In [49]:
# Food table with fixed ounce values, used by the mapping examples that follow.
data = pd.DataFrame(
    {
        'food': [
            'bacon', 'pulled pork', 'bacon',
            'Pastrami', 'corned beef', 'Bacon',
            'pastrami', 'honey ham', 'nova lox',
        ],
        'ounces': [2, 3, 5, 8, 9, 7, 6, 1, 2],
    }
)

In [50]:
data

Unnamed: 0,food,ounces
0,bacon,2
1,pulled pork,3
2,bacon,5
3,Pastrami,8
4,corned beef,9
5,Bacon,7
6,pastrami,6
7,honey ham,1
8,nova lox,2


Suppose you wanted to add a column indicating the type of animal that each food
came from.

In [51]:
# Lookup table from a (lower-case) food name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [52]:
data['food'].str.lower()

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [53]:
dataFoodLoweCase = data['food'].str.lower()

In [54]:
dataFoodLoweCase

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [55]:
data['animal'] = dataFoodLoweCase.map(meat_to_animal)

In [56]:
data

Unnamed: 0,food,ounces,animal
0,bacon,2,pig
1,pulled pork,3,pig
2,bacon,5,pig
3,Pastrami,8,cow
4,corned beef,9,cow
5,Bacon,7,pig
6,pastrami,6,cow
7,honey ham,1,pig
8,nova lox,2,salmon


In [57]:
# Same mapping in a single step: lower-case each food name inside the lambda, then look it up.
# Note: the direct dict lookup raises KeyError for any food that is not a key of meat_to_animal.
data['food'].map(lambda x:meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [58]:
data

Unnamed: 0,food,ounces,animal
0,bacon,2,pig
1,pulled pork,3,pig
2,bacon,5,pig
3,Pastrami,8,cow
4,corned beef,9,cow
5,Bacon,7,pig
6,pastrami,6,cow
7,honey ham,1,pig
8,nova lox,2,salmon


### Using map is a convenient way to perform element-wise transformations and other data cleaning–related operations.

## Replacing Values

In [59]:
data = pd.Series([1,-999,2,-9696,3,-987])
data

0       1
1    -999
2       2
3   -9696
4       3
5    -987
dtype: int64

In [60]:
# Replace each sentinel with its own substitute, written as an explicit old -> new mapping.
data.replace({-999: np.nan, -9696: 0, -987: np.nan})

0    1.0
1    NaN
2    2.0
3    0.0
4    3.0
5    NaN
dtype: float64

In [61]:
data.replace({-999: np.nan})

0       1.0
1       NaN
2       2.0
3   -9696.0
4       3.0
5    -987.0
dtype: float64

In [62]:
data.replace({-999: np.nan, -987: 0, -9696: np.nan})

0    1.0
1    NaN
2    2.0
3    NaN
4    3.0
5    0.0
dtype: float64

## Renaming Axis Indexes

In [63]:
# 3x4 frame holding 0..11, labelled by state (rows) and ordinal words (columns).
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'NewYork'],
    columns=['one', 'two', 'three', 'four'],
)

In [64]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
NewYork,8,9,10,11


In [65]:
def transform(x):
    """Return the first four characters of ``x`` in upper case (e.g. 'Colorado' -> 'COLO')."""
    return x[:4].upper()


In [66]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEWY'], dtype='object')

In [67]:
data.index

Index(['Ohio', 'Colorado', 'NewYork'], dtype='object')

In [68]:
data.rename(index={'Ohio':'SILCHAR'},columns={'one':'cachar'})

Unnamed: 0,cachar,two,three,four
SILCHAR,0,1,2,3
Colorado,4,5,6,7
NewYork,8,9,10,11


## Discretization and Binning

Continuous data is often discretized or otherwise separated into “bins” for analysis.
Suppose you have data about a group of people in a study, and you want to group
them into discrete age buckets.

In [69]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [70]:
divide_into_age_group = [18,30,45,65]

In [71]:
human_age_buckets = pd.cut(ages, divide_into_age_group)

In [72]:
human_age_buckets

[(18, 30], (18, 30], (18, 30], (18, 30], (18, 30], ..., (30, 45], (45, 65], (30, 45], (30, 45], (30, 45]]
Length: 12
Categories (3, interval[int64]): [(18, 30] < (30, 45] < (45, 65]]

In [73]:
human_age_buckets.codes

array([0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1], dtype=int8)

In [74]:
# Count how many ages fall into each bucket.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
pd.Series(human_age_buckets).value_counts()

(18, 30]    6
(30, 45]    5
(45, 65]    1
dtype: int64

In [75]:
# Same counts, keyed by the integer bucket codes instead of the interval labels.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
pd.Series(human_age_buckets.codes).value_counts()

0    6
1    5
2    1
dtype: int64

In [76]:
human_age_buckets.categories


IntervalIndex([(18, 30], (30, 45], (45, 65]]
              closed='right',
              dtype='interval[int64]')

In [77]:
# Human-readable names for the three age bins, in bin order.
# The previous third label was an offensive slur and has been replaced with a neutral term.
# (The variable keeps its original spelling `lables` because the pd.cut call in a later cell refers to it.)
lables = ['Young','Mid Age', 'Senior']

In [78]:
human_age_buckets = pd.cut(ages, divide_into_age_group, labels=lables)

In [79]:
human_age_buckets

[Young, Young, Young, Young, Young, ..., Mid Age, Senior, Mid Age, Mid Age, Mid Age]
Length: 12
Categories (3, object): [Young < Mid Age < Senior]

In [80]:
data = np.random.randn(20)

In [81]:
data

array([-0.13477331,  0.76852521,  0.8419945 ,  0.45138588, -0.51682787,
        0.1923765 , -0.62062024, -1.65983287,  1.23278545, -0.60212215,
        1.24689249, -0.20322171, -1.14588644,  1.88269984,  0.29226642,
        1.08888904,  1.35049082,  0.97121859,  0.1792373 , -0.48383725])

In [82]:
pd.cut(data, 4, precision=2, retbins=True)

([(-0.77, 0.11], (0.11, 1.0], (0.11, 1.0], (0.11, 1.0], (-0.77, 0.11], ..., (1.0, 1.88], (1.0, 1.88], (0.11, 1.0], (0.11, 1.0], (-0.77, 0.11]]
 Length: 20
 Categories (4, interval[float64]): [(-1.66, -0.77] < (-0.77, 0.11] < (0.11, 1.0] < (1.0, 1.88]],
 array([-1.66337541, -0.77419969,  0.11143348,  0.99706666,  1.88269984]))

In [83]:
# Cut the sample into 6 equal-width bins and count the members of each bin.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
pd.Series(pd.cut(data, 6, precision=3)).value_counts()

(0.702, 1.292]      6
(0.111, 0.702]      4
(-1.069, -0.479]    4
(1.292, 1.883]      2
(-0.479, 0.111]     2
(-1.663, -1.069]    2
dtype: int64

## Detecting and Filtering Outliers

Filtering or transforming outliers is largely a matter of applying array operations.
Consider a DataFrame with some normally distributed data

In [84]:
data = np.random.randn(1000,4)

In [85]:
data

array([[-2.22424667, -1.16451236,  0.12668986,  0.31124346],
       [ 1.61215463,  0.12020221, -0.26562143, -2.24147382],
       [-0.94624682, -0.76453071, -0.1270019 ,  0.05648948],
       ..., 
       [-2.55266228,  0.45611839, -0.33959642,  0.40130379],
       [ 0.82921689,  0.40858806,  1.73720709, -0.38296221],
       [-1.20318706, -1.12058573,  0.24330977, -1.20891081]])

In [86]:
df = pd.DataFrame(data=data)

In [87]:
df.head()

Unnamed: 0,0,1,2,3
0,-2.224247,-1.164512,0.12669,0.311243
1,1.612155,0.120202,-0.265621,-2.241474
2,-0.946247,-0.764531,-0.127002,0.056489
3,-0.868969,-0.812931,-0.817962,-0.257771
4,-0.233549,0.666187,0.589928,-1.126974


In [88]:
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.032945,-0.006669,0.011834,0.024676
std,1.013058,0.928225,1.030716,1.05527
min,-3.308834,-3.873404,-2.95477,-3.248468
25%,-0.751922,-0.60907,-0.69979,-0.683682
50%,-0.028365,0.019574,0.028128,0.029722
75%,0.648028,0.619019,0.706726,0.69479
max,3.108813,3.113652,4.544697,3.367479


In [89]:
data = df[2]

In [90]:
data[np.abs(data)>2]

5      2.234496
30     2.601591
31    -2.084000
36    -2.286647
50     2.457248
62     4.544697
66    -2.954770
67     2.156534
75     2.546508
88    -2.374444
96     2.047977
113   -2.305997
145    2.059274
156   -2.411381
161   -2.256771
185   -2.221146
193   -2.031717
210   -2.471396
215   -2.450972
222    2.852838
227    3.716565
261   -2.477104
362   -2.696446
404    2.310195
474   -2.198228
502    2.347049
512   -2.757404
528   -2.392674
549    2.364572
556    3.311156
561   -2.048111
639   -2.370158
661    2.515824
708   -2.761631
753    2.239616
757   -2.517787
772    2.306768
779   -2.083778
808    2.165182
810   -2.143420
833    2.039332
847    2.349957
907    2.409885
944   -2.079035
958    2.234072
979   -2.028559
982    2.273160
Name: 2, dtype: float64

In [91]:
# Select every row that has at least one value whose absolute value exceeds 3.
# `axis` is passed by keyword: the positional form `.any(1)` is no longer accepted by pandas 2.x.
df[(np.abs(df) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
41,3.108813,0.228283,-0.035777,-0.037869
62,-0.497176,0.02999,4.544697,0.531571
67,0.078648,-0.59446,2.156534,-3.00012
101,1.543877,-0.204544,-1.720038,3.162468
110,3.012775,0.782148,0.02203,-0.121043
227,0.873164,-0.712579,3.716565,0.557379
248,-0.656732,0.087453,0.211792,3.367479
355,1.318624,0.109147,-1.375672,3.077765
401,-1.185988,-0.403202,0.234133,-3.248468
445,-0.393707,-0.639718,-0.672529,3.066209


In [92]:
# Overwrite every row that contains at least one value with |x| > 3.
# np.random.uniform(0.1, 0.2) is evaluated once, so all cells of all selected rows receive
# the same scalar (the whole row is replaced, not just the outlying entry).
# `axis` is passed by keyword: the positional form `.any(1)` is no longer accepted by pandas 2.x.
df[(np.abs(df) > 3).any(axis=1)] = np.random.uniform(0.1,0.2)

In [93]:
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.035579,0.003063,0.005521,0.017229
std,0.986245,0.907008,0.995231,1.006188
min,-2.895353,-2.950602,-2.95477,-2.97328
25%,-0.738325,-0.591176,-0.690234,-0.661507
50%,-0.017099,0.042327,0.052916,0.056618
75%,0.624731,0.609086,0.69078,0.66928
max,2.809926,2.741967,2.852838,2.878889


In [94]:
# Movielens 1Mn DataSet
names = ['movie_id', 'title', 'genres']

In [102]:
# Load the MovieLens movies file. The '::' separator is longer than one character, which only
# the Python parsing engine supports; selecting it explicitly avoids the ParserWarning that
# pandas emits when it has to fall back from the default C engine.
movies = pd.read_table('Chap7_data/movies.dat', sep='::', header=None, names=names, engine='python')

  """Entry point for launching an IPython kernel.


In [103]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [104]:
# Set index to movie_id
# Note: the result is only displayed — it is not assigned, so `movies` itself keeps its original index.

movies.set_index('movie_id')

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children's
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|Thriller


In [105]:
all_genres = []

In [107]:
# Collect every genre tag from every movie (duplicates included) by splitting the
# '|'-separated genres field. The list is rebuilt from scratch here, so re-running this
# cell is idempotent and no longer appends a second copy of every genre.
all_genres = [genre for genre_field in movies.genres for genre in genre_field.split('|')]

all_genres

['Animation',
 "Children's",
 'Comedy',
 'Adventure',
 "Children's",
 'Fantasy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Adventure',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Horror',
 'Animation',
 "Children's",
 'Drama',
 'Action',
 'Adventure',
 'Romance',
 'Drama',
 'Thriller',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Action',
 'Action',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Thriller',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Drama',
 'Drama',
 'Sci-Fi',
 'Adventure',
 'Romance',
 "Children's",
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Documentary',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'War',
 'Action',
 'Crime',
 'Drama',
 'Drama',
 'Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Crime',
 'Thrill

In [108]:
# Distinct genres in order of first appearance.
# The list is wrapped in a Series because passing a plain Python list to pd.unique() is deprecated.
genres = pd.Series(all_genres).unique()

In [110]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

## String Manipulations

In [111]:
val = 'a,b, github'

In [112]:
val_arr = val.split(',')

In [113]:
val_arr

['a', 'b', ' github']

#### Remove spaces from val_arr

In [114]:
# Trim leading/trailing whitespace from every piece produced by the split.
pieces = list(map(str.strip, val_arr))

In [115]:
pieces

['a', 'b', 'github']

#### Concatenate python way

In [116]:
'::'.join(pieces)

'a::b::github'