# Data Cleaning of the IMDB Name Basics Data Set

In [1]:
# import libraries
import pandas as pd
import pickle
from datetime import date

In [2]:
# read the IMDB name.basics TSV into a DataFrame
# (tab-separated; 'NA' is passed as an extra missing-value marker)
nBasics = pd.read_csv(
    './data/name.basics.tsv',
    sep='\t',
    index_col=None,
    na_values=['NA'],
    low_memory=False,
)

# rows x columns of the raw data
nBasics.shape

(8726558, 6)

In [3]:
# preview the first five rows of the raw data
nBasics.head(5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0043044,tt0072308,tt0050419,tt0045537"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0071877,tt0038355,tt0037382,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0049189,tt0057345,tt0054452,tt0059956"
3,nm0000004,John Belushi,1949,1982,"actor,writer,soundtrack","tt0072562,tt0080455,tt0078723,tt0077975"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0060827,tt0083922,tt0050986"


In [4]:
# keep only people with no recorded death year
# (the data uses the literal string '\N' as the missing-value placeholder)
nBasicsSub = nBasics.loc[nBasics['deathYear'] == '\\N']

# rows x columns remaining after the filter
nBasicsSub.shape

(8578876, 6)

In [5]:
# discard people whose birth year is the '\N' placeholder (i.e. unknown)
nBasicsSub = nBasicsSub.loc[nBasicsSub['birthYear'] != '\\N']

# rows x columns remaining after the filter
nBasicsSub.shape

(282745, 6)

In [6]:
# preview the first five rows after the deathYear / birthYear filters
nBasicsSub.head(5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0049189,tt0057345,tt0054452,tt0059956"
12,nm0000013,Doris Day,1922,\N,"soundtrack,actress,producer","tt0060463,tt0055100,tt0053172,tt0048317"
13,nm0000014,Olivia de Havilland,1916,\N,"actress,soundtrack","tt0031381,tt0041452,tt0040806,tt0029843"
17,nm0000018,Kirk Douglas,1916,\N,"actor,producer,soundtrack","tt0052365,tt0049456,tt0054331,tt0056195"
46,nm0000047,Sophia Loren,1934,\N,"actress,soundtrack","tt0060121,tt0076085,tt0054749,tt0058335"


In [7]:
# list the distinct birthYear values (still strings at this point)
nBasicsSub['birthYear'].unique()

array(['1934', '1922', '1916', '1940', '1965', '1953', '1930', '1942',
       '1939', '1963', '1935', '1968', '1967', '1969', '1955', '1952',
       '1958', '1974', '1960', '1945', '1975', '1959', '1962', '1964',
       '1957', '1954', '1973', '1948', '1961', '1970', '1966', '1979',
       '1956', '1943', '1972', '1944', '1937', '1949', '1946', '1947',
       '1971', '1950', '1981', '1980', '1976', '1977', '1936', '1951',
       '1941', '1982', '1925', '1926', '1933', '1931', '1938', '1928',
       '1932', '1929', '1927', '1984', '1986', '1978', '1985', '1983',
       '1924', '1923', '1989', '1988', '1992', '1987', '1910', '1901',
       '1990', '1896', '1919', '1892', '1904', '1898', '1895', '1908',
       '1918', '1921', '1915', '1912', '1894', '1906', '1911', '1899',
       '1889', '1991', '1902', '1920', '1890', '1907', '1884', '1879',
       '1903', '1995', '1875', '1913', '1886', '1891', '1914', '1905',
       '1881', '1883', '1893', '1917', '1994', '1876', '1996', '1860',
      

In [8]:
# convert birthYear from string to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce').
# assign() builds a new frame instead of writing into the boolean-filtered
# slice, which avoids pandas' SettingWithCopyWarning.
nBasicsSub = nBasicsSub.assign(
    birthYear=pd.to_numeric(nBasicsSub.birthYear, errors='coerce')
)

In [9]:
# preview the first five rows after the birthYear conversion
nBasicsSub.head(5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0049189,tt0057345,tt0054452,tt0059956"
12,nm0000013,Doris Day,1922,\N,"soundtrack,actress,producer","tt0060463,tt0055100,tt0053172,tt0048317"
13,nm0000014,Olivia de Havilland,1916,\N,"actress,soundtrack","tt0031381,tt0041452,tt0040806,tt0029843"
17,nm0000018,Kirk Douglas,1916,\N,"actor,producer,soundtrack","tt0052365,tt0049456,tt0054331,tt0056195"
46,nm0000047,Sophia Loren,1934,\N,"actress,soundtrack","tt0060121,tt0076085,tt0054749,tt0058335"


In [10]:
# keep only people born after 1899; rows whose birthYear is NaN fail the
# comparison and are therefore dropped as well
nBasicsSub = nBasicsSub.loc[nBasicsSub['birthYear'] > 1899]

# rows x columns remaining after the filter
nBasicsSub.shape

(279812, 6)

In [11]:
def calculate_age(born, today=None):
    """Return the approximate age in years for a birth year.

    Only the calendar year is used, so the result is simply
    ``today.year - born`` regardless of month and day.

    Parameters
    ----------
    born : int or pandas.Series
        Birth year(s); any value that can be subtracted from an int works.
    today : datetime.date, optional
        Reference date. Defaults to ``date.today()``. Pass a fixed date to
        make the result reproducible across runs.

    Returns
    -------
    The reference year minus ``born`` (scalar in, scalar out; Series in,
    Series out).
    """
    if today is None:
        today = date.today()
    return today.year - born

In [12]:
# add an Age column derived from birthYear.
# assign() returns a new frame rather than writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
nBasicsSub = nBasicsSub.assign(Age=calculate_age(nBasicsSub.birthYear))

# display row & column counts
nBasicsSub.shape

(279812, 7)

In [13]:
# preview the first five rows, now including the Age column
nBasicsSub.head(5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,Age
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0049189,tt0057345,tt0054452,tt0059956",84
12,nm0000013,Doris Day,1922,\N,"soundtrack,actress,producer","tt0060463,tt0055100,tt0053172,tt0048317",96
13,nm0000014,Olivia de Havilland,1916,\N,"actress,soundtrack","tt0031381,tt0041452,tt0040806,tt0029843",102
17,nm0000018,Kirk Douglas,1916,\N,"actor,producer,soundtrack","tt0052365,tt0049456,tt0054331,tt0056195",102
46,nm0000047,Sophia Loren,1934,\N,"actress,soundtrack","tt0060121,tt0076085,tt0054749,tt0058335",84


In [14]:
# list the distinct Age values to spot implausible ones
nBasicsSub['Age'].unique()

array([ 84,  96, 102,  78,  53,  65,  88,  76,  79,  55,  83,  50,  51,
        49,  63,  66,  60,  44,  58,  73,  43,  59,  56,  54,  61,  64,
        45,  70,  57,  48,  52,  39,  62,  75,  46,  74,  81,  69,  72,
        71,  47,  68,  37,  38,  42,  41,  82,  67,  77,  36,  93,  92,
        85,  87,  80,  90,  86,  89,  91,  34,  32,  40,  33,  35,  94,
        95,  29,  30,  26,  31, 108, 117,  28,  99, 114, 110, 100,  97,
       103, 106, 112, 107,  27, 116,  98, 111, 115,  23, 105, 104, 113,
       101,  24,  22, 118,  20, 109,  25,  19,  21,  17,  18,   0,  16,
        12,  15,  14,  13,  11,  10,   9,   8,   7,   6,   5,   4,   3,
         2,   1], dtype=int64)

In [15]:
# keep only people whose computed Age is below 103
nBasicsSub = nBasicsSub.loc[nBasicsSub['Age'] < 103]

# rows x columns remaining after the filter
nBasicsSub.shape

(275970, 7)

In [16]:
# persist the cleaned frame as a pickle file (path is relative to the working directory)
nBasicsSub.to_pickle(path='v1_nBasicsCleaned.pkl')