In [15]:
import os
import pandas as pd
import numpy as np

In [3]:
# Relative path to the Netflix titles CSV inside the "data" directory.
DATA_FILE = os.path.join("data", "netflix_titles.csv")

## First DataFrame!

In [4]:
# Read the CSV at DATA_FILE into a DataFrame.
df = pd.read_csv(DATA_FILE)

# Preview the first rows; df.tail() shows the last rows instead.
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [5]:
# Column labels of the frame.
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [6]:
# dtype of each column; read_csv was called without explicit dtypes, so these are whatever pandas inferred.
df.dtypes

show_id          int64
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [7]:
# Summary statistics; with default arguments only the numeric columns are summarised.
df.describe()

Unnamed: 0,show_id,release_year
count,6234.0,6234.0
mean,76703680.0,2013.35932
std,10942960.0,8.81162
min,247747.0,1925.0
25%,80035800.0,2013.0
50%,80163370.0,2016.0
75%,80244890.0,2018.0
max,81235730.0,2020.0


In [8]:
# (number of rows, number of columns)
df.shape

(6234, 12)

In [10]:
# Distinct values of the 'type' column, using attribute-style column access.
df.type.unique()

array(['Movie', 'TV Show'], dtype=object)

In [11]:
# Distinct values of the 'type' column, using bracket-style column access.
df['type'].unique()

array(['Movie', 'TV Show'], dtype=object)

In [14]:
# Number of rows for each distinct value of 'type'.
df.type.value_counts()

Movie      4265
TV Show    1969
Name: type, dtype: int64

### Series: similar to NumPy arrays

In [16]:
# Five random integers, each in the half-open range [0, 10).
a = np.random.randint(low=0, high=10, size=5)
a

array([1, 7, 3, 8, 0])

In [17]:
# Wrap the NumPy array in a Series; no index is passed, so pandas assigns the default integer labels.
s = pd.Series(a)
s

0    1
1    7
2    3
3    8
4    0
dtype: int64

In [27]:
# Random data again, this time indexed by explicit string labels.
labels = list("abcde")
a = np.random.randint(low=0, high=10, size=5)

s = pd.Series(data=a, index=labels)
s

a    3
b    2
c    3
d    6
e    1
dtype: int64

In [28]:
# Read the first element two ways: by position (.iloc[0]) and by its label ('a').
# Plain s[0] on a string-labelled Series relies on a positional fallback that
# pandas has deprecated, so the positional read is spelled out with .iloc.
print("Both are same: {} {}".format(s.iloc[0], s['a']))

Both are same: 3 3


### What if the index has repeated labels?

In [34]:
# Index labels may repeat: here 'a' labels three of the five entries.
labels = list("abcaa")
a = np.random.randint(low=0, high=10, size=5)

s = pd.Series(data=a, index=labels)
s

a    5
b    9
c    7
a    3
a    6
dtype: int64

In [33]:
# With repeated labels, a positional lookup still yields a single scalar, while
# the label lookup s['a'] yields every entry labelled 'a' as a Series.
# .iloc[0] is used for the positional read because plain s[0] on a
# string-labelled Series depends on a deprecated positional fallback.
print(
    "s[0]:\n{}\n\ns['a']:\n{}\nTypes: s[0]: {} s['a']: {}".format(
        s.iloc[0], s['a'], type(s.iloc[0]), type(s['a'])
    )
)

s[0]:
3

s['a']:
a    3
a    3
a    2
dtype: int64
Types: s[0]: <class 'numpy.int64'> s['a']: <class 'pandas.core.series.Series'>


## Creating DataFrame

In [35]:
# Raw column data: five names and five random integer balances in [1, 45).
a = [
    'vinay',
    'kuldeep',
    'abhishek',
    'siddharth',
    'viren',
]
b = np.random.randint(low=1, high=45, size=5)

In [36]:
a

['vinay', 'kuldeep', 'abhishek', 'siddharth', 'viren']

In [37]:
b

array([24, 16, 22, 41, 32])

In [41]:
# Build a two-column frame: each dict key becomes a column, each element a row.
# Note: this rebinds `df`, so the frame loaded from the CSV is no longer reachable under that name.
df = pd.DataFrame(data={"name": a, "balance": b})

df.head()

Unnamed: 0,name,balance
0,vinay,24
1,kuldeep,16
2,abhishek,22
3,siddharth,41
4,viren,32


In [42]:
# Attribute-style access to the 'name' column.
df.name

0        vinay
1      kuldeep
2     abhishek
3    siddharth
4        viren
Name: name, dtype: object

In [43]:
# Bracket-style access to the 'name' column.
df['name']

0        vinay
1      kuldeep
2     abhishek
3    siddharth
4        viren
Name: name, dtype: object

In [44]:
# Check the type of the object returned when a single column is selected.
type(df.name)

pandas.core.series.Series

In [45]:
# Select the 'name' column, then the entry whose index label is 0.
df['name'][0]

'vinay'

In [48]:
# Position-based slicing: the first two rows, all columns.
df.iloc[:2, :]

Unnamed: 0,name,balance
0,vinay,24
1,kuldeep,16


In [51]:
# First three entries of the 'name' column, selected by position.
df['name'].iloc[:3]

0       vinay
1     kuldeep
2    abhishek
Name: name, dtype: object

In [53]:
# Label-based selection: all rows, and the columns named in the list (here just 'balance').
df.loc[:, ['balance']]

Unnamed: 0,balance
0,24
1,16
2,22
3,41
4,32


In [57]:
# Element-wise comparison: one boolean per row, True where balance exceeds 30.
df.balance > 30

0    False
1    False
2    False
3     True
4     True
Name: balance, dtype: bool

In [58]:
# Pass the boolean mask to .loc to keep only the rows where balance exceeds 30.
df.loc[df.balance > 30]

Unnamed: 0,name,balance
3,siddharth,41
4,viren,32


### Map

In [61]:
# Prefix every name using Series.map with a lambda.
# The column is overwritten with the result, so re-running this cell adds the prefix again.
df['name'] = df['name'].map(lambda name: "codevector " + name)

In [62]:
df

Unnamed: 0,name,balance
0,codevector vinay,24
1,codevector kuldeep,16
2,codevector abhishek,22
3,codevector siddharth,41
4,codevector viren,32


In [63]:
def example_function(x):
    """Return ``x`` with the text "from example " prepended."""
    prefix = "from example "
    return prefix + x

# A named function can be handed to map in place of a lambda; the result overwrites the 'name' column.
df['name'] = df['name'].map(example_function)
# Display the updated frame.
df

Unnamed: 0,name,balance
0,from example codevector vinay,24
1,from example codevector kuldeep,16
2,from example codevector abhishek,22
3,from example codevector siddharth,41
4,from example codevector viren,32


### Apply

In [71]:
def example_function(row):
    """Return a copy of ``row`` extended with a ``new_balance`` entry.

    Args:
        row: A pandas Series exposing a ``balance`` entry whose value
            ``int()`` accepts.

    Returns:
        A new Series holding everything in ``row`` plus ``new_balance``,
        set to ``int(row.balance) + 10``. The Series passed in is left
        unmodified.
    """
    # Work on a copy: pandas documents that functions passed to
    # DataFrame.apply should not mutate the object they are given.
    updated = row.copy()
    updated['new_balance'] = int(row.balance) + 10
    return updated

# axis='columns' hands example_function one row at a time.
# The result is only displayed here; it is not assigned back to df.
df.apply(example_function, axis='columns')

Unnamed: 0,name,balance,new_balance
0,from example codevector vinay,24,34
1,from example codevector kuldeep,16,26
2,from example codevector abhishek,22,32
3,from example codevector siddharth,41,51
4,from example codevector viren,32,42
