# Example (Pandas Data Types)

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Data/Students/students.csv")
df

Unnamed: 0,Student,zipcode,grade,birthday,credits
0,Smith,47803,B+,01/01/2000,160
1,Lee,47803,B,01/01/2002,150
2,Rao,60733,B+,01/01/2001,180
3,Chen,30700,C,01/01/2001,150
4,Ibu,50705,A,01/01/2003,150


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Student   5 non-null      object
 1   zipcode   5 non-null      int64 
 2   grade     5 non-null      object
 3   birthday  5 non-null      object
 4   credits   5 non-null      int64 
dtypes: int64(2), object(3)
memory usage: 328.0+ bytes


#### in Pandas, categorical data types can be used either for nominal or ordinal attributes.

### Memory usage

In [11]:
s = pd.Series(['apple','banana','orange']*1000)
s.head(10) #there is repeat

0     apple
1    banana
2    orange
3     apple
4    banana
5    orange
6     apple
7    banana
8    orange
9     apple
dtype: object

In [12]:
s.nbytes

24000

In [14]:
s.dtypes

dtype('O')

In [15]:
s = s.astype('category')
s.dtypes

CategoricalDtype(categories=['apple', 'banana', 'orange'], ordered=False)

In [24]:
s.nbytes #repitition->3 types

10000

In [25]:
s = pd.Series(['ID_%d' % i for i in range(1000)])
s.head(10)

0    ID_0
1    ID_1
2    ID_2
3    ID_3
4    ID_4
5    ID_5
6    ID_6
7    ID_7
8    ID_8
9    ID_9
dtype: object

In [26]:
s.dtypes

dtype('O')

In [27]:
s.nbytes

8000

In [28]:
s = s.astype('category')
s.dtypes

CategoricalDtype(categories=['ID_0', 'ID_1', 'ID_10', 'ID_100', 'ID_101', 'ID_102',
                  'ID_103', 'ID_104', 'ID_105', 'ID_106',
                  ...
                  'ID_990', 'ID_991', 'ID_992', 'ID_993', 'ID_994', 'ID_995',
                  'ID_996', 'ID_997', 'ID_998', 'ID_999'],
                 ordered=False)

In [29]:
s.nbytes #unique->many category objects

10000

In [31]:
df #nominal

Unnamed: 0,Student,zipcode,grade,birthday,credits
0,Smith,47803,B+,01/01/2000,160
1,Lee,47803,B,01/01/2002,150
2,Rao,60733,B+,01/01/2001,180
3,Chen,30700,C,01/01/2001,150
4,Ibu,50705,A,01/01/2003,150


In [32]:
df.sort_values('Student')

Unnamed: 0,Student,zipcode,grade,birthday,credits
3,Chen,30700,C,01/01/2001,150
4,Ibu,50705,A,01/01/2003,150
1,Lee,47803,B,01/01/2002,150
2,Rao,60733,B+,01/01/2001,180
0,Smith,47803,B+,01/01/2000,160


In [33]:
df.Student.min() #alphabetical order

'Chen'

In [34]:
df.Student.max() #alphabetical order

'Smith'

In [36]:
df.zipcode.sort_values()

3    30700
0    47803
1    47803
4    50705
2    60733
Name: zipcode, dtype: int64

In [38]:
df.zipcode.mode() #nominal->only mode make sense

0    47803
dtype: int64

In [40]:
df.grade.sort_values() #ordinal:B>B+ -> wrong

4     A
1     B
0    B+
2    B+
3     C
Name: grade, dtype: object

In [42]:
 df['grade'] = pd.Categorical(df.grade, 
                             categories = ['F','D-','D+','C-','C','C+','B-','B','B+','A-','A','A+'],
                             ordered=True)

In [43]:
df.dtypes

Student       object
zipcode        int64
grade       category
birthday      object
credits        int64
dtype: object

In [46]:
df.sort_values('grade',ascending = False)

Unnamed: 0,Student,zipcode,grade,birthday,credits
4,Ibu,50705,A,01/01/2003,150
0,Smith,47803,B+,01/01/2000,160
2,Rao,60733,B+,01/01/2001,180
1,Lee,47803,B,01/01/2002,150
3,Chen,30700,C,01/01/2001,150


In [47]:
df.grade.mode()

0    B+
Name: grade, dtype: category
Categories (12, object): [F < D- < D+ < C- ... B+ < A- < A < A+]

In [49]:
grade_median = 'B+'
print('median =', grade_median)

median = B+


In [51]:
df.dtypes #interval

Student       object
zipcode        int64
grade       category
birthday      object
credits        int64
dtype: object

In [52]:
df['birthday'] = pd.to_datetime(df.birthday)
df.dtypes

Student             object
zipcode              int64
grade             category
birthday    datetime64[ns]
credits              int64
dtype: object

In [54]:
pandas_epoch = pd.to_datetime(0, unit = 'ns') #birthday before epoch might be negative: interval is allowed to be negative
print('Pandas epoch =', pandas_epoch)

Pandas epoch = 1970-01-01 00:00:00


In [55]:
pd.to_datetime(-10000, unit = 'ns')

Timestamp('1969-12-31 23:59:59.999990')

In [56]:
df.birthday.mode()

0   2001-01-01
dtype: datetime64[ns]

In [57]:
df.birthday.astype('int64') #convert to integers

0     946684800000000000
1    1009843200000000000
2     978307200000000000
3     978307200000000000
4    1041379200000000000
Name: birthday, dtype: int64

In [59]:
pd.to_datetime(df.birthday.astype('int64').mean()) #do not need tp convert

Timestamp('2001-05-26 19:12:00')

In [60]:
df.birthday.mean() 

Timestamp('2001-05-26 19:12:00')

In [62]:
pd.to_datetime(df.birthday.astype('int64').median()) #median can only use this

Timestamp('2001-01-01 00:00:00')

In [63]:
df.credits.mean() 

158.0

In [64]:
df.credits.mode() 

0    150
dtype: int64

In [66]:
df.credits.median() 

150.0

In [67]:
100*(df.credits.max()-df.credits.mean())/df.credits.mean()

13.924050632911392