**Pandas provides many methods and attributes for data exploration. This notebook walks through some of the most commonly used ones.**

In [1]:
import pandas as pd

In [2]:
# Column-oriented sample data: one key per column, one list entry per row.
dict1 = {
    "name": ["furkan", "semih", "sibel"],
    "age": [26, 33, 35],
    "location": ["ıstanbul", "ankara", "malatya"],
}

In [3]:
dict1

{'name': ['furkan', 'semih', 'sibel'],
 'age': [26, 33, 35],
 'location': ['ıstanbul', 'ankara', 'malatya']}

In [5]:
# Build a DataFrame from the dict of lists: keys become the columns and the
# rows get the default integer index (0, 1, 2).
df = pd.DataFrame(dict1)
df

Unnamed: 0,name,age,location
0,furkan,26,ıstanbul
1,semih,33,ankara
2,sibel,35,malatya


In [6]:
# Print a concise summary of df: index range, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      3 non-null      object
 1   age       3 non-null      int64 
 2   location  3 non-null      object
dtypes: int64(1), object(2)
memory usage: 112.0+ bytes


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max). Only the numeric
# column "age" shows up in the result.
df.describe()

Unnamed: 0,age
count,3.0
mean,31.333333
std,4.725816
min,26.0
25%,29.5
50%,33.0
75%,34.0
max,35.0


In [8]:
# The frame's data as a 2-D NumPy array; with mixed column types the array
# dtype is object.
df.values

array([['furkan', 26, 'ıstanbul'],
       ['semih', 33, 'ankara'],
       ['sibel', 35, 'malatya']], dtype=object)

In [9]:
# Row labels of the frame; here the default RangeIndex.
df.index

RangeIndex(start=0, stop=3, step=1)

In [10]:
# First rows of the frame (5 by default); df has only 3 rows, so all are shown.
df.head()

Unnamed: 0,name,age,location
0,furkan,26,ıstanbul
1,semih,33,ankara
2,sibel,35,malatya


In [11]:
# Last rows of the frame (5 by default); df has only 3 rows, so all are shown.
df.tail()

Unnamed: 0,name,age,location
0,furkan,26,ıstanbul
1,semih,33,ankara
2,sibel,35,malatya


In [13]:
# Column labels of the frame.
df.columns

Index(['name', 'age', 'location'], dtype='object')

In [16]:
# rename() returns a new frame with "name" relabelled to "names". The result
# is not assigned here, so df itself keeps its original column names.
df.rename(columns = {"name" : "names"})

Unnamed: 0,names,age,location
0,furkan,26,ıstanbul
1,semih,33,ankara
2,sibel,35,malatya


In [17]:
# Display df again: the column is still called "name", because the rename
# result was never assigned back.
df

Unnamed: 0,name,age,location
0,furkan,26,ıstanbul
1,semih,33,ankara
2,sibel,35,malatya


In [18]:
# (number of rows, number of columns)
df.shape

(3, 3)

In [19]:
# Sort rows by age (ascending by default); returns a new frame.
df.sort_values("age")

Unnamed: 0,name,age,location
0,furkan,26,ıstanbul
1,semih,33,ankara
2,sibel,35,malatya


In [20]:
# Sort rows by age, oldest first.
df.sort_values("age", ascending = False)

Unnamed: 0,name,age,location
2,sibel,35,malatya
1,semih,33,ankara
0,furkan,26,ıstanbul


In [22]:
# Multi-key sort: name descending first, then location ascending to break ties.
# The ascending list is matched position-by-position with the column list.
df.sort_values(["name" , "location"], ascending = [False, True])

Unnamed: 0,name,age,location
2,sibel,35,malatya
1,semih,33,ankara
0,furkan,26,ıstanbul


In [23]:
# Single brackets with one label select the column as a Series.
df["name"]

0    furkan
1     semih
2     sibel
Name: name, dtype: object

In [24]:
# Double brackets (a list of labels) select a one-column DataFrame instead of a Series.
df[["name"]]

Unnamed: 0,name
0,furkan
1,semih
2,sibel


In [25]:
# Keep only the rows whose name equals "furkan".
df.loc[df["name"].eq("furkan")]

Unnamed: 0,name,age,location
0,furkan,26,ıstanbul


In [28]:
# Keep only the rows whose age is exactly 33.
df.loc[df["age"].eq(33)]

Unnamed: 0,name,age,location
1,semih,33,ankara


In [29]:
# Keep only the rows whose age is greater than 30.
df.loc[df["age"].gt(30)]

Unnamed: 0,name,age,location
1,semih,33,ankara
2,sibel,35,malatya


In [30]:
# The boolean Series behind the filter: True where age is greater than 30.
df["age"].gt(30)

0    False
1     True
2     True
Name: age, dtype: bool

In [32]:
# Keep the rows whose location is one of the listed cities.
df.loc[df["location"].isin(["ankara", "malatya"])]

Unnamed: 0,name,age,location
1,semih,33,ankara
2,sibel,35,malatya


**We can sort DataFrames by any column we like, and we can sort on several columns at once. That lets us answer questions such as "Which are the top 5 most profitable companies in our DataFrame?", "Which job pays the least?", or "Which
countries have the oldest citizens?". Sorting is a powerful tool for finding the minimum and maximum values, which we can then filter out and analyze further.**


In [33]:
df

Unnamed: 0,name,age,location
0,furkan,26,ıstanbul
1,semih,33,ankara
2,sibel,35,malatya


In [40]:
# Rows located in ankara or ıstanbul AND older than 30.
df.loc[df["location"].isin(["ankara", "ıstanbul"]) & df["age"].gt(30)]

Unnamed: 0,name,age,location
1,semih,33,ankara


In [42]:
# Rows older than 25 AND located in ıstanbul.
df.loc[df["age"].gt(25) & df["location"].eq("ıstanbul")]

Unnamed: 0,name,age,location
0,furkan,26,ıstanbul


In [43]:
# Summary of df again, to check the column dtypes before converting them.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      3 non-null      object
 1   age       3 non-null      int64 
 2   location  3 non-null      object
dtypes: int64(1), object(2)
memory usage: 112.0+ bytes


In [44]:
# dtype of each column, as a Series indexed by column name.
df.dtypes

name        object
age          int64
location    object
dtype: object

In [48]:
# Cast location to str and write it back to the same column.
df["location"] = df["location"].astype(str)

In [49]:
# location still reports dtype object after the str cast.
df.dtypes

name        object
age          int64
location    object
dtype: object

In [52]:
# Convert age from integers to floats and store it back in the same column.
# Bracket assignment is used so the target is unambiguously a column.
df["age"] = df["age"].astype(float)

In [53]:
# Display df: age is now shown as floats (26.0, 33.0, 35.0).
df

Unnamed: 0,name,age,location
0,furkan,26.0,ıstanbul
1,semih,33.0,ankara
2,sibel,35.0,malatya


In [54]:
# Categories to keep when filtering on the location column.
locations = ["malatya", "ankara", "antalya", "konya"]

In [55]:
# Boolean Series: True where the row's location appears in the locations list.
df["location"].isin(locations)

0    False
1     True
2     True
Name: location, dtype: bool

In [56]:
# Store the boolean mask so it can be reused for row selection.
mask = df["location"].isin(locations)

In [57]:
# Indexing with a boolean mask keeps only the rows where the mask is True.
df[mask]

Unnamed: 0,name,age,location
1,semih,33.0,ankara
2,sibel,35.0,malatya


**When subsetting rows based on a categorical column, we use the `.isin()` method. It makes subsetting a breeze: we first create a list of the categories we want to keep, and then pass it to `.isin()` to filter out the data
we need.**

In [2]:
# Second sample data set: nine ages with their matching heights.
d2 = {
    "age": [40, 45, 24, 32, 18, 22, 60, 78, 90],
    "height": [178, 180, 155, 179, 190, 188, 166, 169, 170],
}
d2

{'age': [40, 45, 24, 32, 18, 22, 60, 78, 90],
 'height': [178, 180, 155, 179, 190, 188, 166, 169, 170]}

In [5]:
# Build the numeric frame used for the summary-statistics examples.
df2 = pd.DataFrame(d2)

In [6]:
df2

Unnamed: 0,age,height
0,40,178
1,45,180
2,24,155
3,32,179
4,18,190
5,22,188
6,60,166
7,78,169
8,90,170


In [7]:
# Arithmetic mean of the age column.
df2["age"].mean()

45.44444444444444

In [8]:
# Arithmetic mean of the height column.
df2["height"].mean()

175.0

In [9]:
# Median (50th percentile) of the age column.
df2["age"].median()

40.0

In [10]:
# Median (50th percentile) of the height column.
df2["height"].median()

178.0

In [12]:
# All the summary statistics for both numeric columns in one table.
df2.describe()

Unnamed: 0,age,height
count,9.0,9.0
mean,45.444444,175.0
std,25.569079,11.101802
min,18.0,155.0
25%,24.0,169.0
50%,40.0,178.0
75%,60.0,180.0
max,90.0,190.0


In [13]:
# Variance of the age column.
df2["age"].var()

653.7777777777778

In [14]:
# Standard deviation of the age column (matches the std row of describe()).
df2["age"].std()

25.56907854768681

In [15]:
# Standard deviation of the height column.
df2["height"].std()

11.10180165558726

In [16]:
# Total of all values in the age column.
df2["age"].sum()

409

In [17]:
# Total of all values in the height column.
df2["height"].sum()

1575

In [19]:
# First quartile (25th percentile) of age.
df2["age"].quantile(0.25)

24.0

In [20]:
# Third quartile (75th percentile) of height.
df2["height"].quantile(0.75)

180.0

In [21]:
# Running total of age, row by row; the last value equals the column sum.
df2["age"].cumsum()

0     40
1     85
2    109
3    141
4    159
5    181
6    241
7    319
8    409
Name: age, dtype: int64

In [22]:
# Running maximum of age: each row holds the largest value seen so far.
df2["age"].cummax()

0    40
1    45
2    45
3    45
4    45
5    45
6    60
7    78
8    90
Name: age, dtype: int64

In [23]:
df2

Unnamed: 0,age,height
0,40,178
1,45,180
2,24,155
3,32,179
4,18,190
5,22,188
6,60,166
7,78,169
8,90,170


In [26]:
# Drop rows whose age repeats an earlier row's age; the result goes into a new
# frame and df2 is left untouched.
df3 = df2.drop_duplicates(subset = "age")

In [29]:
# How many times each distinct age occurs; every age in df2 appears once.
df2["age"].value_counts()

78    1
45    1
60    1
24    1
90    1
40    1
22    1
18    1
32    1
Name: age, dtype: int64

In [39]:
# Sample data with repeated names, used for counting and grouping.
d4 = {
    "name": ["furkan", "semih", "sibel", "furkan", "furkan", "semih"],
    "age": [26, 33, 35, 26, 26, 33],
}

In [40]:
# Frame with duplicate names for the value_counts / groupby examples.
df4 = pd.DataFrame(d4)

In [41]:
df4

Unnamed: 0,name,age
0,furkan,26
1,semih,33
2,sibel,35
3,furkan,26
4,furkan,26
5,semih,33


In [42]:
# Frequency of each name, most frequent first.
df4["name"].value_counts()

furkan    3
semih     2
sibel     1
Name: name, dtype: int64

In [43]:
# Frequency of each age, explicitly sorted by count in descending order.
df4["age"].value_counts(sort=True, ascending=False)

26    3
33    2
35    1
Name: age, dtype: int64

In [44]:
# Relative frequency of each name (proportions that sum to 1) instead of raw counts.
df4["name"].value_counts(normalize=True)

furkan    0.500000
semih     0.333333
sibel     0.166667
Name: name, dtype: float64

In [45]:
df4

Unnamed: 0,name,age
0,furkan,26
1,semih,33
2,sibel,35
3,furkan,26
4,furkan,26
5,semih,33


In [47]:
# Mean of every remaining column per name; at this point that is only age.
df4.groupby("name").mean()

Unnamed: 0_level_0,age
name,Unnamed: 1_level_1
furkan,26
semih,33
sibel,35


In [48]:
# Add a new column; the list supplies one value for each of the six rows.
df4["weight"] = [88,78,68,55,66,90]

In [49]:
df4

Unnamed: 0,name,age,weight
0,furkan,26,88
1,semih,33,78
2,sibel,35,68
3,furkan,26,55
4,furkan,26,66
5,semih,33,90


In [51]:
# Per-name mean again, now covering both age and the new weight column.
df4.groupby("name").mean()

Unnamed: 0_level_0,age,weight
name,Unnamed: 1_level_1,Unnamed: 2_level_1
furkan,26.0,69.666667
semih,33.0,84.0
sibel,35.0,68.0


In [52]:
# Group by age, select only weight, then average: returns a Series indexed by age.
df4.groupby("age")["weight"].mean()

age
26    69.666667
33    84.000000
35    68.000000
Name: weight, dtype: float64

In [55]:
import numpy as np

In [57]:
# Several aggregations at once for every non-key column, grouped by name.
# The aggregations are given as strings instead of the builtins min/max/sum
# and np.mean: pandas >= 2.1 raises a FutureWarning for those callables, while
# the string names select the same pandas implementations and produce the same
# 'min' / 'max' / 'sum' / 'mean' column labels.
df4.groupby("name").agg(["min", "max", "sum", "mean"])

Unnamed: 0_level_0,age,age,age,age,weight,weight,weight,weight
Unnamed: 0_level_1,min,max,sum,mean,min,max,sum,mean
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
furkan,26,26,78,26,55,88,209,69.666667
semih,33,33,66,33,78,90,168,84.0
sibel,35,35,35,35,68,68,68,68.0


In [58]:
# Group by the (name, age) pair; the result has a two-level index and the mean
# of the remaining column, weight.
df4.groupby(["name", "age"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
name,age,Unnamed: 2_level_1
furkan,26,69.666667
semih,33,84.0
sibel,35,68.0


In [65]:
df4

Unnamed: 0,name,age,weight
0,furkan,26,88
1,semih,33,78
2,sibel,35,68
3,furkan,26,55
4,furkan,26,66
5,semih,33,90


In [69]:
# Add one more row to df4. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, so the row is wrapped in a one-row DataFrame and concatenated;
# ignore_index=True renumbers the result 0..n-1 exactly as append did.
# Note: this cell rebinds df4, so running it again adds the row a second time.
df4 = pd.concat(
    [df4, pd.DataFrame([{"name": "furkan", "age": 27, "weight": 73}])],
    ignore_index=True,
)

In [70]:
# Display df4: the new furkan / 27 / 73 row is now at index 6.
df4

Unnamed: 0,name,age,weight
0,furkan,26,88
1,semih,33,78
2,sibel,35,68
3,furkan,26,55
4,furkan,26,66
5,semih,33,90
6,furkan,27,73


In [71]:
# Same two-key grouping after adding the row: furkan now has two groups
# (age 26 and age 27).
df4.groupby(["name", "age"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
name,age,Unnamed: 2_level_1
furkan,26,69.666667
furkan,27,73.0
semih,33,84.0
sibel,35,68.0


In [74]:
# min / max / sum of the selected columns per name. Aggregations are named by
# string rather than passed as Python builtins, which pandas >= 2.1 flags with
# a FutureWarning; the resulting column labels are unchanged.
df4.groupby("name")[["age", "weight"]].agg(["min", "max", "sum"])

Unnamed: 0_level_0,age,age,age,weight,weight,weight
Unnamed: 0_level_1,min,max,sum,min,max,sum
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
furkan,26,27,105,55,88,282
semih,33,33,66,78,90,168
sibel,35,35,35,68,68,68


In [75]:
# The same summary via pivot_table: here the aggregation name is the outer
# column level and the value column the inner one (the reverse of groupby().agg()).
# String names replace the builtins min/max/sum, which pandas >= 2.1 deprecates
# as aggregation callables; the labels stay 'min' / 'max' / 'sum'.
df4.pivot_table(index="name", values=["age", "weight"], aggfunc=["min", "max", "sum"])

Unnamed: 0_level_0,min,min,max,max,sum,sum
Unnamed: 0_level_1,age,weight,age,weight,age,weight
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
furkan,26,55,27,88,105,282
semih,33,78,33,90,66,168
sibel,35,68,35,68,35,68


In [2]:
# Sample data for the index examples: five people with age and city.
d6 = {
    "name": ["furkan", "semih", "sibel", "ahmet", "sengul"],
    "age": [26, 33, 35, 61, 55],
    "location": ["istanbul", "istanbul", "malatya", "malatya", "ankara"],
}

In [5]:
# Build the frame used for the set_index / reset_index / sort_index examples.
df6 = pd.DataFrame(d6)
df6

Unnamed: 0,name,age,location
0,furkan,26,istanbul
1,semih,33,istanbul
2,sibel,35,malatya
3,ahmet,61,malatya
4,sengul,55,ankara


In [7]:
# Use the name column as the row index. The result is only displayed, not
# assigned, so df6 itself keeps its integer index.
df6.set_index("name")

Unnamed: 0_level_0,age,location
name,Unnamed: 1_level_1,Unnamed: 2_level_1
furkan,26,istanbul
semih,33,istanbul
sibel,35,malatya
ahmet,61,malatya
sengul,55,ankara


In [8]:
# reset_index() moves the current index into a regular column. df6 still has
# its default integer index, so that index shows up as a new "index" column.
df6.reset_index()

Unnamed: 0,index,name,age,location
0,0,furkan,26,istanbul
1,1,semih,33,istanbul
2,2,sibel,35,malatya
3,3,ahmet,61,malatya
4,4,sengul,55,ankara


In [14]:
df6

Unnamed: 0,name,age,location
0,furkan,26,istanbul
1,semih,33,istanbul
2,sibel,35,malatya
3,ahmet,61,malatya
4,sengul,55,ankara


In [16]:
# Keep the name-indexed version in its own variable so it can be sorted by index.
df7 = df6.set_index("name")
df7

Unnamed: 0_level_0,age,location
name,Unnamed: 1_level_1,Unnamed: 2_level_1
furkan,26,istanbul
semih,33,istanbul
sibel,35,malatya
ahmet,61,malatya
sengul,55,ankara


In [17]:
# Sort the rows by their index labels (the names, alphabetically).
df7.sort_index()

Unnamed: 0_level_0,age,location
name,Unnamed: 1_level_1,Unnamed: 2_level_1
ahmet,61,malatya
furkan,26,istanbul
semih,33,istanbul
sengul,55,ankara
sibel,35,malatya


In [9]:
# Row-oriented sample data: a list with one dict per record.
list1 = [
    {"name": "furkan", "surname": "guner", "age": 26},
    {"name": "semih", "surname": "guner", "age": 33},
]

In [10]:
# A list of dicts builds a frame row by row: each dict is one row and its keys
# become the columns.
df10 = pd.DataFrame(list1)

In [11]:
df10

Unnamed: 0,name,surname,age
0,furkan,guner,26
1,semih,guner,33


In [5]:
# The same two records in column-oriented form (a dict of lists).
# Note: this rebinds the name dict1 that was used at the top of the notebook.
dict1 = {
    "name": ["furkan", "semih"],
    "surname": ["guner", "guner"],
    "age": [26, 33],
}

In [7]:
# A dict of lists builds a frame column by column.
df11 = pd.DataFrame(dict1)

In [8]:
df11

Unnamed: 0,name,surname,age
0,furkan,guner,26
1,semih,guner,33


In [12]:
# Write the frame to trial.csv in the current working directory. The row index
# is written too, as the first (unnamed) column.
df11.to_csv("trial.csv")

In [15]:
# Read the file back; index_col=0 uses the first CSV column as the row index
# instead of loading it as an ordinary data column.
df12 = pd.read_csv("trial.csv", index_col = 0)

In [16]:
df12

Unnamed: 0,name,surname,age
0,furkan,guner,26
1,semih,guner,33
