###### 1. How to import pandas and check the version?

In [2]:
import pandas as pd

In [3]:
pd.__version__

'0.24.2'

###### 2. How to create a series from a list, numpy array and dict?

In [4]:
import numpy as np
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

In [5]:
s1=pd.Series(mylist)
s1

0     a
1     b
2     c
3     e
4     d
5     f
6     g
7     h
8     i
9     j
10    k
11    l
12    m
13    n
14    o
15    p
16    q
17    r
18    s
19    t
20    u
21    v
22    w
23    x
24    y
25    z
dtype: object

In [6]:
s2=pd.Series(myarr)
s2

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
20    20
21    21
22    22
23    23
24    24
25    25
dtype: int32

In [7]:
s3=pd.Series(mydict)

In [8]:
s3

a     0
b     1
c     2
e     3
d     4
f     5
g     6
h     7
i     8
j     9
k    10
l    11
m    12
n    13
o    14
p    15
q    16
r    17
s    18
t    19
u    20
v    21
w    22
x    23
y    24
z    25
dtype: int64

###### 3. How to convert the index of a series into a column of a dataframe?

In [9]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

In [10]:
df = ser.to_frame().reset_index()

In [11]:
df

Unnamed: 0,index,0
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4
5,f,5
6,g,6
7,h,7
8,i,8
9,j,9


###### 4. How to combine many series to form a dataframe?

In [12]:
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))
con=pd.concat([ser1,ser2],axis=1)
con

Unnamed: 0,0,1
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4
5,f,5
6,g,6
7,h,7
8,i,8
9,j,9


###### 2nd solution

In [13]:
df=pd.DataFrame({'col1':ser1,'col2':ser2})

In [14]:
df

Unnamed: 0,col1,col2
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4
5,f,5
6,g,6
7,h,7
8,i,8
9,j,9


###### 5. How to assign name to the series’ index?

In [15]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

In [16]:
ser.name='alphabets'
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

###### 6.From ser1 remove items present in ser2.

In [17]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [22]:
ser1[~ser1.isin(ser2)]

0    1
1    2
2    3
dtype: int64

###### 7.Get all items of ser1 and ser2 not common to both.

In [23]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [29]:
ser_u = pd.Series(np.union1d(ser1, ser2))  # union
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersect
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

###### 8.Compute the minimum, 25th percentile, median, 75th, and maximum of ser.

In [31]:
ser = pd.Series(np.random.normal(10, 5, 25))
np.percentile(ser,q=[0,25,50,75,100])

array([-0.55317309,  5.76580801, 10.63482462, 15.27967761, 17.85622233])

###### 9.Calculte the frequency counts of each unique value ser.

In [32]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

In [33]:
ser.value_counts()

c    9
b    4
d    4
f    4
e    4
g    3
a    1
h    1
dtype: int64

##### 10.From ser, keep the top 2 most frequent items as it is and replace everything else as ‘Other’.

In [34]:
# Input
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))

# Solution
print("Top 2 Freq:", ser.value_counts())
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
ser

Top 2 Freq: 4    7
2    2
1    2
3    1
dtype: int64


0         4
1         4
2         4
3         2
4         4
5         4
6     Other
7         2
8         4
9     Other
10        4
11    Other
dtype: object

###### 11.Bin the series ser into 10 equal deciles and replace the values with the bin name.

In [35]:
ser = pd.Series(np.random.random(20))

In [36]:
pd.qcut(ser, q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1], 
        labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']).head()

0    5th
1    9th
2    2nd
3    9th
4    6th
dtype: category
Categories (10, object): [1st < 2nd < 3rd < 4th ... 7th < 8th < 9th < 10th]

###### 12.Reshape the series ser into a dataframe with 7 rows and 5 columns

In [37]:
ser = pd.Series(np.random.randint(1, 10, 35))
ser

0     5
1     4
2     9
3     1
4     6
5     3
6     1
7     2
8     8
9     1
10    7
11    1
12    5
13    9
14    2
15    6
16    5
17    4
18    4
19    9
20    5
21    2
22    2
23    8
24    8
25    3
26    2
27    7
28    6
29    9
30    2
31    4
32    2
33    9
34    5
dtype: int32

In [39]:
sdf = pd.DataFrame(ser.values.reshape(7,5))

In [40]:
sdf

Unnamed: 0,0,1,2,3,4
0,5,4,9,1,6
1,3,1,2,8,1
2,7,1,5,9,2
3,6,5,4,4,9
4,5,2,2,8,8
5,3,2,7,6,9
6,2,4,2,9,5


###### 13.Find the positions of numbers that are multiples of 3 from ser.

In [5]:
ser = pd.Series(np.random.randint(1, 10, 7))
ser

0    8
1    4
2    8
3    5
4    2
5    9
6    5
dtype: int32

In [4]:
import numpy as np

In [10]:
a=np.argwhere(ser % 3==0)

In [6]:
np.where(ser%3==0)

(array([5], dtype=int64),)

In [7]:
a=np.where(ser%3==0)

In [11]:
a

array([[5]], dtype=int64)

###### 14.From ser, extract the items at positions in list pos.

In [43]:
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

In [44]:
ser[pos]

0     a
4     e
8     i
14    o
20    u
dtype: object

In [45]:
#alternate solution
ser.take(pos)

0     a
4     e
8     i
14    o
20    u
dtype: object

###### 15.Stack ser1 and ser2 vertically and horizontally (to form a dataframe).

In [13]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

In [15]:
con= pd.concat([ser1,ser2],axis=1)

In [16]:
con

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


In [17]:
con1= pd.concat([ser1,ser2],axis=0)

In [18]:
con1

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

In [19]:
#alternate 
ser1.append(ser2)

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

###### 16.Get the positions of items of ser2 in ser1 as a list.

In [53]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

In [61]:
ser1[ser1.isin(ser2)]

0    10
4     3
5     1
8    13
dtype: int64

In [57]:
ser1[ser1.isin(ser2)].index

Int64Index([0, 4, 5, 8], dtype='int64')

In [64]:
ser= list(ser1[ser1.isin(ser2)].index)

In [65]:
ser

[0, 4, 5, 8]

In [62]:
#Solution1
[np.where(i == ser1)[0].tolist()[0] for i in ser2]

[5, 4, 0, 8]

In [63]:
#Solution2
[pd.Index(ser1).get_loc(i) for i in ser2]

[5, 4, 0, 8]

###### 17.Compute the mean squared error of truth and predicted series.

In [66]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

In [67]:
np.mean((truth-pred)**2)

0.36659981597819846

###### 18.Change the first character of each word to upper case in each word of ser.

In [68]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
ser

0     how
1      to
2    kick
3    ass?
dtype: object

In [74]:
#Solution 1
pd.Series(i.title() for i in ser)

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [75]:
#Solution2
ser.map(lambda x: x.title())

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [76]:
#Solution3
ser.map(lambda x: x[0].upper() + x[1:])

0     How
1      To
2    Kick
3    Ass?
dtype: object

###### 19.How to calculate the number of characters in each word in a series?

In [77]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

In [78]:
ser.map(lambda x: len(x))

0    3
1    2
2    4
3    4
dtype: int64

###### 20.Difference of differences between the consequtive numbers of ser.

In [80]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

In [83]:
ser.diff()

0    NaN
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    6.0
7    8.0
dtype: float64

In [84]:
ser.diff().to_list()

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]

In [85]:
ser.diff().diff().to_list()

[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]

###### 21.How to convert a series of date-strings to a timeseries?

In [86]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

In [87]:
ser

0         01 Jan 2010
1          02-02-2011
2            20120303
3          2013/04/04
4          2014-05-05
5    2015-06-06T12:20
dtype: object

In [90]:
#Solution1
pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [91]:
#Solution2
from dateutil.parser import parse
ser.map(lambda x: parse(x))

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

###### 22. How to get the day of month, week number, day of year and day of week from a series of date strings?

In [93]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
ser

0         01 Jan 2010
1          02-02-2011
2            20120303
3          2013/04/04
4          2014-05-05
5    2015-06-06T12:20
dtype: object

In [99]:
ser_ts= pd.to_datetime(ser)
ser_ts

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [98]:
ser_ts.dt.day.tolist()  
#day of month

[1, 2, 3, 4, 5, 6]

In [100]:
ser_ts.dt.year.tolist()

[2010, 2011, 2012, 2013, 2014, 2015]

In [101]:
ser_ts.dt.dayofyear.tolist()
# day of year 

[1, 33, 63, 94, 125, 157]

In [104]:
ser_ts.dt.dayofweek.tolist()


[4, 2, 5, 3, 0, 5]

In [105]:
ser_ts.dt.weekday_name.tolist()
#day of week

['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']

In [106]:
ser_ts.dt.weekofyear.tolist()
#week number

[53, 5, 9, 14, 19, 23]

###### 23.Change ser to dates that start with 4th of the respective months.

In [113]:
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
#Solution1
ser=ser.map(lambda x:'4 '+x)

In [114]:
pd.to_datetime(ser)

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

In [115]:
#Solution2
from dateutil.parser import parse
# Parse the date
ser_ts = ser.map(lambda x: parse(x))

# Construct date string with date as 4
ser_datestr = ser_ts.dt.year.astype('str') + '-' + ser_ts.dt.month.astype('str') + '-' + '04'
# Format it.
[parse(i).strftime('%Y-%m-%d') for i in ser_datestr]

['2010-01-04', '2011-02-04', '2012-03-04']

In [116]:
#Solution3
ser.map(lambda x: parse('04 ' + x))

0   2004-01-04 20:10:00
1   2004-02-04 20:11:00
2   2004-03-04 20:12:00
dtype: datetime64[ns]

###### 24.From ser, extract words that contain atleast 2 vowels.

In [117]:
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

In [121]:
from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i,0) for i in list('aeiou')]) >= 2)
ser[mask]
#count the value from the list ,get i or 0 and then sum the counted values

0     Apple
1    Orange
4     Money
dtype: object

###### 25.Extract the valid emails from the series emails. The regex pattern for valid emails is provided as reference.

In [122]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

In [130]:
#Solution1
import re
emails.str.findall(pattern, flags=re.IGNORECASE)

0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object

In [132]:
#Solution2
mask = emails.map(lambda x: bool(re.match(pattern, x)))
emails[mask]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

In [134]:
#Solution3
[x[0] for x in [re.findall(pattern, email) for email in emails] if len(x) > 0]

['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

###### 26.Compute the mean of weights of each fruit.

In [136]:
#Input
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weight= pd.Series(np.linspace(1, 10, 10))
print(weight.tolist())
print(fruit.tolist())
#> [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
#> ['banana', 'carrot', 'apple', 'carrot', 'carrot', 'apple', 'banana', 'carrot', 'apple', 'carrot']

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['apple', 'banana', 'banana', 'carrot', 'banana', 'apple', 'banana', 'carrot', 'banana', 'carrot']


In [142]:
weight.groupby(fruit).mean()

apple     3.500000
banana    5.200000
carrot    7.333333
dtype: float64

###### 27.Compute the euclidean distance between series (points) p and q, without using a packaged formula.

In [144]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

In [145]:
#Solution1
sum((p - q)**2)**.5

18.16590212458495

In [146]:
#Solution2
np.linalg.norm(p-q)

18.16590212458495

###### 28.Get the positions of peaks (values surrounded by smaller values on both sides) in ser.

In [21]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

In [24]:
l=[]
for i in range(len(ser)-2):
    if(ser[i+1]>ser[i] & ser[i+1]>ser[i+2]):
        l.append(ser[i+1])

In [25]:
l

[10]

###### 29.Replace the spaces in my_str with the least frequent character.

In [26]:
my_str = 'dbc deb abed gade'

In [27]:
s=pd.Series(list(my_str))
f=s.value_counts()
f

d    4
e    3
b    3
     3
a    2
g    1
c    1
dtype: int64

In [34]:
least_freq =  f.tail(1)
s=list(least_freq.index)

In [36]:
s= str(s)

In [38]:
my_str.replace(" ",s)

"dbc['c']deb['c']abed['c']gade"

###### 30. How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values?

In [170]:
ser = pd.Series(np.random.randint(1,10,10), pd.date_range('2000-01-01', periods=10, freq='W-SAT'))
ser

2000-01-01    4
2000-01-08    2
2000-01-15    2
2000-01-22    9
2000-01-29    1
2000-02-05    6
2000-02-12    7
2000-02-19    5
2000-02-26    9
2000-03-04    1
Freq: W-SAT, dtype: int32

###### 31.ser has missing dates and values. Make all missing dates appear and fill up with value from previous date.

In [171]:
ser = pd.Series([1,10,3,np.nan], index=pd.to_datetime(['2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08']))
print(ser)

2000-01-01     1.0
2000-01-03    10.0
2000-01-06     3.0
2000-01-08     NaN
dtype: float64


In [172]:
ser.resample('D').ffill()

2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     NaN
Freq: D, dtype: float64

In [173]:
#Alternate Solution
ser.resample('D').bfill()  # fill with next value
ser.resample('D').bfill().ffill()  # fill next else prev value

2000-01-01     1.0
2000-01-02    10.0
2000-01-03    10.0
2000-01-04     3.0
2000-01-05     3.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     3.0
Freq: D, dtype: float64

###### 32.Compute autocorrelations for the first 10 lags of ser. Find out which lag has the largest correlation.

In [41]:
ser = pd.Series(np.arange(20) + np.random.normal(1, 10, 20))

In [42]:
ser

0     -2.926250
1     14.267369
2     12.991278
3     -9.018849
4     -0.636890
5     16.415375
6      6.360892
7     10.605195
8      7.297860
9     30.757883
10     3.334876
11     3.692554
12    21.671259
13    17.506519
14    15.093871
15    13.327967
16    23.289515
17    20.246678
18    15.704134
19    16.720468
dtype: float64

In [43]:
corr= ser.corr()

TypeError: corr() missing 1 required positional argument: 'other'

###### 33.Import every 50th row of BostonHousing dataset as a dataframe.

In [44]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', chunksize=50)

In [49]:
df2 = pd.DataFrame()
for chunk in df:
    df2 = df2.append(chunk.iloc[0,:])

In [50]:
df2

Unnamed: 0,age,b,chas,crim,dis,indus,lstat,medv,nox,ptratio,rad,rm,tax,zn
0,65.2,396.9,0.0,0.00632,4.09,2.31,4.98,24.0,0.538,15.3,1.0,6.575,296.0,18.0
50,45.7,395.56,0.0,0.08873,6.8147,5.64,13.45,19.7,0.439,16.8,4.0,5.963,243.0,21.0
100,79.9,394.76,0.0,0.14866,2.7778,8.56,9.42,27.5,0.52,20.9,5.0,6.727,384.0,0.0
150,97.3,372.8,0.0,1.6566,1.618,19.58,14.1,21.5,0.871,14.7,5.0,6.122,403.0,0.0
200,13.9,384.3,0.0,0.01778,7.6534,1.47,4.45,32.9,0.403,17.0,3.0,7.135,402.0,95.0
250,13.0,396.28,0.0,0.1403,7.3967,5.86,5.9,24.4,0.431,19.1,7.0,6.487,330.0,22.0
300,47.4,390.86,0.0,0.04417,7.8278,2.24,6.07,24.8,0.4,14.8,5.0,6.871,358.0,70.0
350,44.4,396.9,0.0,0.06211,8.7921,1.25,5.98,22.9,0.429,19.7,1.0,6.49,335.0,40.0
400,100.0,396.9,0.0,25.0461,1.5888,18.1,26.77,5.6,0.693,20.2,24.0,5.987,666.0,0.0
450,92.6,0.32,0.0,6.71772,2.3236,18.1,17.44,13.4,0.713,20.2,24.0,6.749,666.0,0.0


###### 34.Import the boston housing dataset, but while importing change the 'medv' (median house value) column so that values < 25 becomes ‘Low’ and > 25 becomes ‘High’.

In [54]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',converters={'medv': lambda x: 'High' if float(x) > 25 else 'Low'})

In [55]:
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,Low
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,Low
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,High
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,High
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,High


###### 35. How to create a dataframe with rows as strides from a given series?

###### 36. How to import only specified columns from a csv file?

In [59]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',usecols=['crim','medv'])

In [60]:
df.head()

Unnamed: 0,crim,medv
0,0.00632,24.0
1,0.02731,21.6
2,0.02729,34.7
3,0.03237,33.4
4,0.06905,36.2


###### 37.Get the number of rows, columns, datatype and summary statistics of each column of the Cars93 dataset. Also get the numpy array and list equivalent of the dataframe.



In [61]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

In [62]:
df.shape

(93, 27)

In [63]:
df.dtypes

Manufacturer           object
Model                  object
Type                   object
Min.Price             float64
Price                 float64
Max.Price             float64
MPG.city              float64
MPG.highway           float64
AirBags                object
DriveTrain             object
Cylinders              object
EngineSize            float64
Horsepower            float64
RPM                   float64
Rev.per.mile          float64
Man.trans.avail        object
Fuel.tank.capacity    float64
Passengers            float64
Length                float64
Wheelbase             float64
Width                 float64
Turn.circle           float64
Rear.seat.room        float64
Luggage.room          float64
Weight                float64
Origin                 object
Make                   object
dtype: object

In [65]:
df.describe()

Unnamed: 0,Min.Price,Price,Max.Price,MPG.city,MPG.highway,EngineSize,Horsepower,RPM,Rev.per.mile,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight
count,86.0,91.0,88.0,84.0,91.0,91.0,86.0,90.0,87.0,85.0,91.0,89.0,92.0,87.0,88.0,89.0,74.0,86.0
mean,17.118605,19.616484,21.459091,22.404762,29.065934,2.658242,144.0,5276.666667,2355.0,16.683529,5.076923,182.865169,103.956522,69.448276,38.954545,27.853933,13.986486,3104.593023
std,8.82829,9.72428,10.696563,5.84152,5.370293,1.045845,53.455204,605.554811,486.916616,3.375748,1.045953,14.792651,6.856317,3.778023,3.304157,3.018129,3.120824,600.129993
min,6.7,7.4,7.9,15.0,20.0,1.0,55.0,3800.0,1320.0,9.2,2.0,141.0,90.0,60.0,32.0,19.0,6.0,1695.0
25%,10.825,12.35,14.575,18.0,26.0,1.8,100.75,4800.0,2017.5,14.5,4.0,174.0,98.0,67.0,36.0,26.0,12.0,2647.5
50%,14.6,17.7,19.15,21.0,28.0,2.3,140.0,5200.0,2360.0,16.5,5.0,181.0,103.0,69.0,39.0,27.5,14.0,3085.0
75%,20.25,23.5,24.825,25.0,31.0,3.25,170.0,5787.5,2565.0,19.0,6.0,192.0,110.0,72.0,42.0,30.0,16.0,3567.5
max,45.4,61.9,80.0,46.0,50.0,5.7,300.0,6500.0,3755.0,27.0,8.0,219.0,119.0,78.0,45.0,36.0,22.0,4105.0


In [66]:
# how many columns under each dtype
print(df.get_dtype_counts())
print(df.dtypes.value_counts())

float64    18
object      9
dtype: int64
float64    18
object      9
dtype: int64


In [67]:
# numpy array 
df_arr = df.values


In [68]:
df_arr

array([['Acura', 'Integra', 'Small', ..., 2705.0, 'non-USA',
        'Acura Integra'],
       [nan, 'Legend', 'Midsize', ..., 3560.0, 'non-USA', 'Acura Legend'],
       ['Audi', '90', 'Compact', ..., 3375.0, 'non-USA', 'Audi 90'],
       ...,
       ['Volkswagen', 'Corrado', 'Sporty', ..., 2810.0, 'non-USA',
        'Volkswagen Corrado'],
       ['Volvo', '240', 'Compact', ..., 2985.0, 'non-USA', 'Volvo 240'],
       [nan, '850', 'Midsize', ..., 3245.0, 'non-USA', 'Volvo 850']],
      dtype=object)

In [69]:
# list
df_list = df.values.tolist()

In [70]:
df_list

[['Acura',
  'Integra',
  'Small',
  12.9,
  15.9,
  18.8,
  25.0,
  31.0,
  'None',
  'Front',
  '4',
  1.8,
  140.0,
  6300.0,
  2890.0,
  'Yes',
  13.2,
  5.0,
  177.0,
  102.0,
  68.0,
  37.0,
  26.5,
  nan,
  2705.0,
  'non-USA',
  'Acura Integra'],
 [nan,
  'Legend',
  'Midsize',
  29.2,
  33.9,
  38.7,
  18.0,
  25.0,
  'Driver & Passenger',
  'Front',
  '6',
  3.2,
  200.0,
  5500.0,
  2335.0,
  'Yes',
  18.0,
  5.0,
  195.0,
  115.0,
  71.0,
  38.0,
  30.0,
  15.0,
  3560.0,
  'non-USA',
  'Acura Legend'],
 ['Audi',
  '90',
  'Compact',
  25.9,
  29.1,
  32.3,
  20.0,
  26.0,
  'Driver only',
  'Front',
  '6',
  2.8,
  172.0,
  5500.0,
  2280.0,
  'Yes',
  16.9,
  5.0,
  180.0,
  102.0,
  67.0,
  37.0,
  28.0,
  14.0,
  3375.0,
  'non-USA',
  'Audi 90'],
 ['Audi',
  '100',
  'Midsize',
  nan,
  37.7,
  44.6,
  19.0,
  26.0,
  'Driver & Passenger',
  nan,
  '6',
  nan,
  172.0,
  5500.0,
  2535.0,
  nan,
  21.1,
  6.0,
  193.0,
  106.0,
  nan,
  37.0,
  31.0,
  17.0,
  3405.0,
