The main data structures pandas provides are Series and DataFrames. After a brief introduction to these two data structures and data ingestion, the key features of pandas this notebook covers are:

Generating descriptive statistics on data
Data cleaning using built in pandas functions
Frequent data operations for subsetting, filtering, insertion, deletion and aggregation of data
Merging multiple datasets using dataframes
Working with timestamps and time-series data

http://pandas.pydata.org/pandas-docs/stable/

In [1]:
import pandas as pd
ser=pd.Series(data=[100,200,300,400,500], index=['tom', 'bob','nancy','dan','eric'])



**importing csv file in pandas**

In [50]:
import pandas as pd
movies=pd.read_csv("D:\pandas\movies.csv")
ratings=pd.read_csv("D:/pandas/ratings.csv")

In [51]:
movies.head()
type(movies)

pandas.core.frame.DataFrame

In [52]:
print(movies)

       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
62418   209157                           We (2018)   
62419   209159           Window of the Soul (2001)   
62420   209163                    Bad Poems (2018)   
62421   209169                 A Girl Thing (2001)   
62422   209171      Women of Devil's Island (1962)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              

In [53]:
ser

tom      100
bob      200
nancy    300
dan      400
eric     500
dtype: int64

In [54]:
ser.index

Index(['tom', 'bob', 'nancy', 'dan', 'eric'], dtype='object')

In [55]:
ser['nancy']

300

In [56]:
ser[[4,3,1]]

eric    500
dan     400
bob     200
dtype: int64

In [57]:
'sonam' in ser

False

In [58]:
'bob' in ser

True

 in keyword to find wheter elemet present in data or not

In [59]:
ser

tom      100
bob      200
nancy    300
dan      400
eric     500
dtype: int64

In [60]:
ser*3

tom       300
bob       600
nancy     900
dan      1200
eric     1500
dtype: int64

multiplication of the values in ser

In [61]:
ser**2

tom       10000
bob       40000
nancy     90000
dan      160000
eric     250000
dtype: int64

squre of values in ser

# pandas dataframe

In [62]:
d = {'one' : pd.Series([100., 200., 300.], index=['apple', 'ball', 'clock']),
     'two' : pd.Series([111., 222., 333., 4444.], index=['apple', 'ball', 'cerill', 'dancy'])}

In [63]:
df=pd.DataFrame(d)

In [64]:
df

Unnamed: 0,one,two
apple,100.0,111.0
ball,200.0,222.0
cerill,,333.0
clock,300.0,
dancy,,4444.0


In [65]:
df.index

Index(['apple', 'ball', 'cerill', 'clock', 'dancy'], dtype='object')

In [66]:
df.columns

Index(['one', 'two'], dtype='object')

In [67]:
pd.DataFrame(d, index=['dancy','ball', 'apple'])

Unnamed: 0,one,two
dancy,,4444.0
ball,200.0,222.0
apple,100.0,111.0


# create dataframe from the list of dictionary

In [68]:
data=[{'a':1,'b':2},{'z':100,'apple':200,'red':300}]
pd.DataFrame(data)

Unnamed: 0,a,b,z,apple,red
0,1.0,2.0,,,
1,,,100.0,200.0,300.0


In [69]:
pd.DataFrame(data, columns=['joe', 'dora','alice'])


Unnamed: 0,joe,dora,alice
0,,,
1,,,


In [70]:
# Basic dataframe operations

In [71]:
df

Unnamed: 0,one,two
apple,100.0,111.0
ball,200.0,222.0
cerill,,333.0
clock,300.0,
dancy,,4444.0


In [72]:
df['one']


apple     100.0
ball      200.0
cerill      NaN
clock     300.0
dancy       NaN
Name: one, dtype: float64

In [73]:
df['three']=df['one']*df['two']
df

Unnamed: 0,one,two,three
apple,100.0,111.0,11100.0
ball,200.0,222.0,44400.0
cerill,,333.0,
clock,300.0,,
dancy,,4444.0,


In [74]:
df['flag']=df['one']>250    # if one>250: store.flag otherwise pass

df

Unnamed: 0,one,two,three,flag
apple,100.0,111.0,11100.0,False
ball,200.0,222.0,44400.0,False
cerill,,333.0,,False
clock,300.0,,,True
dancy,,4444.0,,False


In [75]:
two=df.pop('two')

In [76]:
two

apple      111.0
ball       222.0
cerill     333.0
clock        NaN
dancy     4444.0
Name: two, dtype: float64

In [77]:
df

Unnamed: 0,one,three,flag
apple,100.0,11100.0,False
ball,200.0,44400.0,False
cerill,,,False
clock,300.0,,True
dancy,,,False


In [78]:
del df['one']

In [79]:
df

Unnamed: 0,three,flag
apple,11100.0,False
ball,44400.0,False
cerill,,False
clock,,True
dancy,,False


In [80]:
df.insert(0, 'copy_of_three',df['three'])  # 0 is index value
df

Unnamed: 0,copy_of_three,three,flag
apple,11100.0,11100.0,False
ball,44400.0,44400.0,False
cerill,,,False
clock,,,True
dancy,,,False


In [81]:
df['one_upper_half']=df['three'][:2]
df

Unnamed: 0,copy_of_three,three,flag,one_upper_half
apple,11100.0,11100.0,False,11100.0
ball,44400.0,44400.0,False,44400.0
cerill,,,False,
clock,,,True,
dancy,,,False,


# use pandas to read datasets

In [82]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [83]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [84]:
type(movies)

pandas.core.frame.DataFrame

In [85]:
type(ser)

pandas.core.series.Series

In [86]:
tags = pd.read_csv("D:/pandas/tags.csv", sep=',')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [87]:
row_0 = tags.iloc[0]
type(row_0)


pandas.core.series.Series

In [88]:
print(row_0)

userId                3
movieId             260
tag             classic
timestamp    1439472355
Name: 0, dtype: object


In [89]:
row_0.index

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

In [90]:
row_0.name

0

In [91]:
row_0['userId']


3

In [92]:
row_0 = row_0.rename('first_row')
row_0.name

'first_row'

In [93]:
tags.head

<bound method NDFrame.head of          userId  movieId                  tag   timestamp
0             3      260              classic  1439472355
1             3      260               sci-fi  1439472256
2             4     1732          dark comedy  1573943598
3             4     1732       great dialogue  1573943604
4             4     7569     so bad it's good  1573943455
...         ...      ...                  ...         ...
1093355  162521    66934  Neil Patrick Harris  1427311611
1093356  162521   103341     cornetto trilogy  1427311259
1093357  162534   189169               comedy  1527518175
1093358  162534   189169             disabled  1527518181
1093359  162534   189169              robbery  1527518193

[1093360 rows x 4 columns]>

In [94]:
tags.columns

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

# descriptive statastics

In [95]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0


In [96]:
ratings.mode()

Unnamed: 0,userId,movieId,rating,timestamp
0,72315,356,4.0,825638400


In [97]:
ratings.std()

userId       4.679172e+04
movieId      3.919886e+04
rating       1.060744e+00
timestamp    2.268758e+08
dtype: float64

In [98]:
ratings.min()

userId               1.0
movieId              1.0
rating               0.5
timestamp    789652009.0
dtype: float64

In [99]:
ratings.max()

userId       1.625410e+05
movieId      2.091710e+05
rating       5.000000e+00
timestamp    1.574328e+09
dtype: float64

In [100]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [101]:
filter_1=ratings['rating']>0
filter_1.any()     # if there is any value greater than 0 returns true

True

In [102]:
filter_2=ratings['rating']>0
filter_2.all()   # if all values greater than 0

True

# data cleaning :and handling missing data

In [103]:
movies.shape

(62423, 3)

In [104]:
movies.isnull().any  # is any row is null

<bound method NDFrame._add_numeric_operations.<locals>.any of        movieId  title  genres
0        False  False   False
1        False  False   False
2        False  False   False
3        False  False   False
4        False  False   False
...        ...    ...     ...
62418    False  False   False
62419    False  False   False
62420    False  False   False
62421    False  False   False
62422    False  False   False

[62423 rows x 3 columns]>

In [105]:
ratings.shape

(25000095, 4)

In [106]:
# is any row is null

ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [107]:
tags.shape

(1093360, 4)

In [108]:
#is any row is null

tags.isnull().any()

userId       False
movieId      False
tag           True
timestamp    False
dtype: bool

In [109]:
tags=tags.dropna()

now againg we check if any null value in row or not

In [110]:
tags.isnull().any()

userId       False
movieId      False
tag          False
timestamp    False
dtype: bool

In [111]:
tags.shape

(1093344, 4)

In [112]:
import pandas as pd
rating=pd.read_csv("D:/pandas/ratings.csv")


In [113]:
data

[{'a': 1, 'b': 2}, {'z': 100, 'apple': 200, 'red': 300}]

In [114]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [115]:
is_animation = movies['genres'].str.contains('Animation')
# select data which has Animation as a category form genres column
movies[is_animation][5:15]

Unnamed: 0,movieId,title,genres
309,313,"Swan Princess, The (1994)",Animation|Children
359,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
387,392,"Secret Adventures of Tom Thumb, The (1993)",Adventure|Animation
545,551,"Nightmare Before Christmas, The (1993)",Animation|Children|Fantasy|Musical
551,558,"Pagemaster, The (1994)",Action|Adventure|Animation|Children|Fantasy
580,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
586,594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical
587,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
588,596,Pinocchio (1940),Animation|Children|Fantasy|Musical
602,610,Heavy Metal (1981),Action|Adventure|Animation|Horror|Sci-Fi


In [116]:
movies[is_animation].head(15)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
12,13,Balto (1995),Adventure|Animation|Children
47,48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
236,239,"Goofy Movie, A (1995)",Animation|Children|Comedy|Romance
241,244,Gumby: The Movie (1995),Animation|Children
309,313,"Swan Princess, The (1994)",Animation|Children
359,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
387,392,"Secret Adventures of Tom Thumb, The (1993)",Adventure|Animation
545,551,"Nightmare Before Christmas, The (1993)",Animation|Children|Fantasy|Musical
551,558,"Pagemaster, The (1994)",Action|Adventure|Animation|Children|Fantasy


# Group by and aggrigate

In [117]:
ratings_count = rating[['movieId','rating']].groupby('rating').count()
ratings_count

Unnamed: 0_level_0,movieId
rating,Unnamed: 1_level_1
0.5,393068
1.0,776815
1.5,399490
2.0,1640868
2.5,1262797
3.0,4896928
3.5,3177318
4.0,6639798
4.5,2200539
5.0,3612474


In [118]:
average_rating = rating[['movieId','rating']].groupby('movieId').mean()
average_rating.tail()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
209157,1.5
209159,3.0
209163,4.5
209169,3.0
209171,3.0


In [119]:
movie_count = rating[['movieId','rating']].groupby('movieId').count()
movie_count.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,57309
2,24228
3,11804
4,2523
5,11714


In [120]:
movie_count = rating[['movieId','rating']].groupby('movieId').count()
movie_count.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,57309
2,24228
3,11804
4,2523
5,11714


# merge dataframe

In [121]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [122]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [123]:
t=movies.merge(tags, on='movieId', how='inner')   #type of join

t.shape

(1093344, 6)

In [124]:
t

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,791,Owned,1515175493
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1048,imdb top 250,1172144394
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1361,Pixar,1216146311
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3164,Pixar,1223304727
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3164,time travel,1223304729
...,...,...,...,...,...,...
1093339,209063,The Prep School Negro (2012),(no genres listed),96399,Philadelphia,1574021180
1093340,209063,The Prep School Negro (2012),(no genres listed),96399,private school,1574021158
1093341,209063,The Prep School Negro (2012),(no genres listed),96399,quaker,1574021197
1093342,209063,The Prep School Negro (2012),(no genres listed),96399,racism,1574021194


In [125]:
f=movies.merge(tags, on='movieId', how='outer')
f.shape

(1110516, 6)

In [126]:
f

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,791.0,Owned,1.515175e+09
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1048.0,imdb top 250,1.172144e+09
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1361.0,Pixar,1.216146e+09
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3164.0,Pixar,1.223305e+09
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3164.0,time travel,1.223305e+09
...,...,...,...,...,...,...
1110511,209157,We (2018),Drama,,,
1110512,209159,Window of the Soul (2001),Documentary,,,
1110513,209163,Bad Poems (2018),Comedy|Drama,,,
1110514,209169,A Girl Thing (2001),(no genres listed),,,


In [127]:
print("\U0001F923")  # to priny emojies

🤣


# combine aggrigation, merging, and filter to get useful analysis 

In [128]:
avg_ratings=rating.groupby('movieId', as_index=False).mean()

# as_index=False won't make movieid as index
del avg_ratings['userId']
avg_ratings.head()

Unnamed: 0,movieId,rating,timestamp
0,1,3.893708,1153152000.0
1,2,3.251527,1122310000.0
2,3,3.142028,980602300.0
3,4,2.853547,942460500.0
4,5,3.058434,1004723000.0


In [129]:
box_office = movies.merge(avg_ratings, on='movieId', how='inner')
box_office.tail()
    

Unnamed: 0,movieId,title,genres,rating,timestamp
59042,209157,We (2018),Drama,1.5,1574281000.0
59043,209159,Window of the Soul (2001),Documentary,3.0,1574281000.0
59044,209163,Bad Poems (2018),Comedy|Drama,4.5,1574285000.0
59045,209169,A Girl Thing (2001),(no genres listed),3.0,1574292000.0
59046,209171,Women of Devil's Island (1962),Action|Adventure|Drama,3.0,1574292000.0


In [130]:
is_highly_rated=box_office['rating']>=4.0

box_office[is_highly_rated][-5:]

Unnamed: 0,movieId,title,genres,rating,timestamp
59027,209121,Adrenalin: The BMW Touring Car Story (2014),Documentary,4.0,1574143000.0
59028,209123,Square Roots: The Story of SpongeBob SquarePan...,Documentary,4.0,1574174000.0
59029,209129,Destination Titan (2011),Documentary,4.5,1574184000.0
59041,209155,Santosh Subramaniam (2008),Action|Comedy|Romance,5.0,1574272000.0
59044,209163,Bad Poems (2018),Comedy|Drama,4.5,1574285000.0


In [131]:
is_comedy=box_office['genres'].str.contains('Comedy')
box_office[is_comedy][:5]

Unnamed: 0,movieId,title,genres,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893708,1153152000.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.142028,980602300.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.853547,942460500.0
4,5,Father of the Bride Part II (1995),Comedy,3.058434,1004723000.0
6,7,Sabrina (1995),Comedy|Romance,3.363666,978832600.0


In [132]:
box_office[is_comedy & is_highly_rated][-5:]

Unnamed: 0,movieId,title,genres,rating,timestamp
58990,208911,Cheating in Chains (2006),Comedy,4.0,1573858000.0
58998,208939,Klaus (2019),Adventure|Animation|Children|Comedy,4.3125,1574006000.0
59001,208945,Powder (2019),Comedy|Drama,4.5,1573892000.0
59041,209155,Santosh Subramaniam (2008),Action|Comedy|Romance,5.0,1574272000.0
59044,209163,Bad Poems (2018),Comedy|Drama,4.5,1574285000.0


**vectories string operation**

In [133]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [134]:
movie_genres=movies['genres'].str.split('|',expand=True)
movie_genres[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Adventure,Animation,Children,Comedy,Fantasy,,,,,
1,Adventure,Children,Fantasy,,,,,,,
2,Comedy,Romance,,,,,,,,
3,Comedy,Drama,Romance,,,,,,,
4,Comedy,,,,,,,,,
5,Action,Crime,Thriller,,,,,,,
6,Comedy,Romance,,,,,,,,
7,Adventure,Children,,,,,,,,
8,Action,,,,,,,,,
9,Action,Adventure,Thriller,,,,,,,


basically we use .str.split('', expand=True)  which create colum of bar sepreated value

In [135]:
movie_genres['isComedy']=movies['genres'].str.contains('Comedy')
movie_genres[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,isComedy
0,Adventure,Animation,Children,Comedy,Fantasy,,,,,,True
1,Adventure,Children,Fantasy,,,,,,,,False
2,Comedy,Romance,,,,,,,,,True
3,Comedy,Drama,Romance,,,,,,,,True
4,Comedy,,,,,,,,,,True
5,Action,Crime,Thriller,,,,,,,,False
6,Comedy,Romance,,,,,,,,,True
7,Adventure,Children,,,,,,,,,False
8,Action,,,,,,,,,,False
9,Action,Adventure,Thriller,,,,,,,,False


In [136]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [137]:
movies['year']=movies['title'].str.extract('.*\((.*)\).*', expand=True)


In [138]:
movies.tail()

Unnamed: 0,movieId,title,genres,year
62418,209157,We (2018),Drama,2018
62419,209159,Window of the Soul (2001),Documentary,2001
62420,209163,Bad Poems (2018),Comedy|Drama,2018
62421,209169,A Girl Thing (2001),(no genres listed),2001
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama,1962


   # parsing timestamp

In [139]:
tags=pd.read_csv("D:/pandas/ratings.csv", sep=',')


In [140]:
tags.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [141]:
tags.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


# Parsing Timestamps

In [142]:
tags['parsed_time']=pd.to_datetime(tags['timestamp'], unit='s')

In [143]:
tags['parsed_time'].dtype

dtype('<M8[ns]')

In [144]:
tags.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,parsed_time
0,1,296,5.0,1147880044,2006-05-17 15:34:04
1,1,306,3.5,1147868817,2006-05-17 12:26:57


In [145]:
greater_than_t = tags['parsed_time'] > '2015-02-01'

selected_rows = tags[greater_than_t]

tags.shape, selected_rows.shape

((25000095, 5), (7471657, 5))

sorting the table using timestamp

In [146]:
tags.sort_values(by='parsed_time', ascending=True)[:10]

Unnamed: 0,userId,movieId,rating,timestamp,parsed_time
326761,2262,21,3.0,789652009,1995-01-09 11:46:49
326810,2262,1079,3.0,789652009,1995-01-09 11:46:49
326767,2262,47,5.0,789652009,1995-01-09 11:46:49
15845015,102689,1,4.0,822873600,1996-01-29 00:00:00
15845023,102689,39,5.0,822873600,1996-01-29 00:00:00
16940364,109832,32,4.0,822873600,1996-01-29 00:00:00
15845027,102689,47,5.0,822873600,1996-01-29 00:00:00
15845028,102689,50,5.0,822873600,1996-01-29 00:00:00
15845029,102689,52,4.0,822873600,1996-01-29 00:00:00
15845031,102689,58,5.0,822873600,1996-01-29 00:00:00
