In [26]:
import pandas as pd

# Load the CSV piece by piece. Passing `chunksize` is the way to cope with
# very large (50GB+) files: read_csv then hands back an iterable reader
# object instead of a DataFrame, and each iteration step yields one chunk.
df_chunk = pd.read_csv(
    'iris.csv',
    index_col=0,
    chunksize=10,
)

chunk_list = []

for chunk in df_chunk:
    # Hook for per-chunk pre-processing: transform `chunk` here, before it
    # is collected.
    chunk_list.append(chunk)

# Stitch the collected chunks back together into one DataFrame.
df = pd.concat(chunk_list)

# From here on, dropping unimportant columns saves computing time.
# Alternatively, only the columns of interest can be read in the first place:
df_example = pd.read_csv('iris.csv', usecols=['Species'])

In [13]:
# Next step: inspect the column dtypes to spot candidates for smaller, memory-saving types
df.dtypes

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [22]:
# Cast a column to an explicit dtype. 'float64' is what the column already
# holds, so nothing changes here; a different dtype could be named instead.
target_dtype = 'float64'
df['SepalLengthCm'] = df['SepalLengthCm'].astype(target_dtype)
df.dtypes

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [5]:
df['SepalLengthCm'].head(10) # Preferred style: select the column with bracket indexing, then show its first 10 values

Id
1     5.1
2     4.9
3     4.7
4     4.6
5     5.0
6     5.4
7     4.6
8     5.0
9     4.4
10    4.9
Name: SepalLengthCm, dtype: float64

In [4]:
df.SepalLengthCm.head(10) # Avoid this style: attribute access returns the same column as df['SepalLengthCm'], so there is no need to mix both ways

Id
1     5.1
2     4.9
3     4.7
4     4.6
5     5.0
6     5.4
7     4.6
8     5.0
9     4.4
10    4.9
Name: SepalLengthCm, dtype: float64

In [27]:
# Summarize the dataset: index range, per-column non-null counts and dtypes, and memory usage

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
SepalLengthCm    150 non-null float64
SepalWidthCm     150 non-null float64
PetalLengthCm    150 non-null float64
PetalWidthCm     150 non-null float64
Species          150 non-null object
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [28]:
# Position-based access with .iloc.
df.iloc[0] # retrieves the first row by position; NOTE(review): this value is not displayed, since a cell only echoes its last expression
df.iloc[0,0] # first row, first column -> a single value

5.1

In [40]:
# Label-based access with .loc: rows are picked by their Id label (not by
# position) and the label slice includes both endpoints. The Id labels
# start at 1, so the slice begins at 1 instead of at a label that is not
# present in the index.
df.loc[1:5, 'SepalWidthCm']

Id
1    3.5
2    3.0
3    3.2
4    3.1
5    3.6
Name: SepalWidthCm, dtype: float64

In [55]:
# Custom summary per group: for every species, the sum and the maximum of
# the sepal length.
(
    df
    .groupby('Species')
    .agg({'SepalLengthCm': ['sum', 'max']})
)

Unnamed: 0_level_0,SepalLengthCm,SepalLengthCm
Unnamed: 0_level_1,sum,max
Species,Unnamed: 1_level_2,Unnamed: 2_level_2
Iris-setosa,250.3,5.8
Iris-versicolor,296.8,7.0
Iris-virginica,329.4,7.9


In [32]:
# Order the rows by the values of one column, largest first.
df.sort_values(by="SepalLengthCm", ascending=False)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
132,7.9,3.8,6.4,2.0,Iris-virginica
136,7.7,3.0,6.1,2.3,Iris-virginica
123,7.7,2.8,6.7,2.0,Iris-virginica
118,7.7,3.8,6.7,2.2,Iris-virginica
119,7.7,2.6,6.9,2.3,Iris-virginica
106,7.6,3.0,6.6,2.1,Iris-virginica
131,7.4,2.8,6.1,1.9,Iris-virginica
108,7.3,2.9,6.3,1.8,Iris-virginica
126,7.2,3.2,6.0,1.8,Iris-virginica
110,7.2,3.6,6.1,2.5,Iris-virginica
