#### Import panda and numpy library

In [1]:
import numpy as np
import pandas as pd

#### check the panda version

In [3]:
print(pd.__version__)
print(pd.show_versions(as_json=True))

1.4.4




{
  "system": {
    "commit": "ca60aab7340d9989d9428e11a51467658190bb6b",
    "python": "3.9.13.final.0",
    "python-bits": 64,
    "OS": "Windows",
    "OS-release": "10",
    "Version": "10.0.22000",
    "machine": "AMD64",
    "processor": "Intel64 Family 6 Model 140 Stepping 1, GenuineIntel",
    "byteorder": "little",
    "LC_ALL": null,
    "LANG": null,
    "LOCALE": {
      "language-code": "English_India",
      "encoding": "1252"
    }
  },
  "dependencies": {
    "pandas": "1.4.4",
    "numpy": "1.21.5",
    "pytz": "2022.1",
    "dateutil": "2.8.2",
    "setuptools": "63.4.1",
    "pip": "22.2.2",
    "Cython": "0.29.32",
    "pytest": "7.1.2",
    "hypothesis": null,
    "sphinx": "5.0.2",
    "blosc": null,
    "feather": null,
    "xlsxwriter": "3.0.3",
    "lxml.etree": "4.9.1",
    "html5lib": null,
    "pymysql": null,
    "psycopg2": null,
    "jinja2": "2.11.3",
    "IPython": "7.31.1",
    "pandas_datareader": null,
    "bs4": "4.11.1",
    "bottleneck": "1.3.5",


#### How to create a series from a list, numpy array and dict?

In [6]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

sr_mylist =pd.Series(mylist)
sr_myarr =pd.Series(myarr)
sr_mydict =pd.Series(mydict)


In [7]:
sr_mylist.head()

0    a
1    b
2    c
3    e
4    d
dtype: object

#### How to convert the index of a series into a column of a dataframe?

In [10]:
df=sr_mylist.to_frame().reset_index()
df.head()

Unnamed: 0,index,0
0,0,a
1,1,b
2,2,c
3,3,e
4,4,d


####  How to combine many series to form a dataframe?

In [11]:
df1= sr_myarr.to_frame().reset_index()
df1.head()

Unnamed: 0,index,0
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [16]:
df_merge= pd.concat([sr_mylist, sr_myarr], axis=1)
df_merge.reset_index()

Unnamed: 0,index,0,1
0,0,a,0
1,1,b,1
2,2,c,2
3,3,e,3
4,4,d,4
5,5,f,5
6,6,g,6
7,7,h,7
8,8,i,8
9,9,j,9


#### How to assign name to the series’ index?

In [18]:
df_merge.columns=["alphabet","number"]
df_merge.head()


Unnamed: 0,alphabet,number
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


#### How to get the items of series A not present in series B?

In [24]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
ser3= ser1[ ~(ser1.isin(ser2))]
ser3


0    1
1    2
2    3
dtype: int64

#### How to get the items not common to both series A and series B?

In [28]:
ser_u=np.union1d(ser1, ser2)
ser_inter= np.intersect1d(ser1, ser2)
ser_final= ser_u[ ~pd.Series(ser_u).isin( pd.Series(ser_inter))]
ser_final

array([1, 2, 3, 6, 7, 8], dtype=int64)

#### How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

In [31]:
print(np.min(ser1))
print(np.percentile(ser1, 25))
print(np.median(ser1))
print(np.percentile(ser1, 75))
print(np.max(ser1))

state = np.random.RandomState(100)
ser = pd.Series(state.normal(10, 5, 25))

# Solution
np.percentile(ser, q=[0, 25, 50, 75, 100])

1
2.0
3.0
4.0
5


array([ 1.25117263,  7.70986507, 10.92259345, 13.36360403, 18.0949083 ])

#### How to get frequency counts of unique items of a series?

In [34]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
ser.value_counts()

e    8
f    7
h    4
g    4
c    2
d    2
a    2
b    1
dtype: int64

#### How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

In [54]:
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))
ser.value_counts()
print(ser[ser.value_counts().index[:2]])
ser[~ser.isin(ser.value_counts().index[:2])]="others"
ser
 

4    4
2    4
dtype: int32


0     others
1          2
2          4
3          2
4          4
5     others
6          4
7     others
8          4
9     others
10         2
11         4
dtype: object

#### How to bin a numeric series to 10 groups of equal size?

In [58]:
ser = pd.Series(np.random.random(20))
ser1=pd.qcut(ser,q=[0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1] , 
             labels=["1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th"]
        )
ser1.head()


0    10th
1     1st
2     4th
3     8th
4     2nd
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']

#### How to convert a numpy array to a dataframe of given shape? (L1) (7*5)

In [61]:
ser = pd.Series(np.random.randint(1, 10, 35))
df=pd.DataFrame(ser.values.reshape(7,5))
df.shape

(7, 5)

#### How to extract items at given positions from a series

In [68]:
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]
ser[pos]


0     a
4     e
8     i
14    o
20    u
dtype: object

#### How to stack two series vertically and horizontally ?

In [78]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))
#horizontal
df= (pd.concat([ser1, ser2], axis=1))
print(df.head())
#vertical
ser3= (pd.concat([ser1, ser2], axis=0))
print(ser3)

   0  1
0  0  a
1  1  b
2  2  c
3  3  d
4  4  e
0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object


#### How to get the positions of items of series A in another series B?

In [83]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])
[pd.Index(ser1).get_loc(i) for i in ser2]

[5, 4, 0, 8]

#### How to compute the mean squared error on a truth and predicted series?

In [85]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)
np.mean((truth-pred)**2)

0.28511476729114626

#### How to convert the first character of each element in a series to uppercase?

In [88]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
print([ (i[0].upper()+i[1:]) for i in ser  ])
ser.map(lambda x: x.title())

['How', 'To', 'Kick', 'Ass?']


0     How
1      To
2    Kick
3    Ass?
dtype: object

#### How to calculate the number of characters in each word in a series?

In [93]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
(ser.map(lambda x: len(x))).to_list()

[3, 2, 4, 4]

####  How to compute difference of differences between consequtive numbers of a series?

In [97]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])
print(ser.diff().tolist())
print(ser.diff().diff().tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]


#### How to convert a series of date-strings to a timeseries?

In [98]:
import datetime as dt

In [99]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
ser.map(lambda x: pd.to_datetime(x) )

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

####  How to get the day of month, week number, day of year and day of week from a series of date strings?

In [106]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
ser_datetime= pd.to_datetime(ser)
print(ser_datetime.dt.day.tolist())
print(ser_datetime.dt.weekday.tolist())
print(ser_datetime.dt.day_of_year.tolist())
print(ser_datetime.dt.strftime('%A').tolist())
print(ser_datetime.dt.strftime('%a').tolist())
print(ser_datetime.dt.strftime('%d').tolist())

[1, 2, 3, 4, 5, 6]
[4, 2, 5, 3, 0, 5]
[1, 33, 63, 94, 125, 157]
['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']
['Fri', 'Wed', 'Sat', 'Thu', 'Mon', 'Sat']
['01', '02', '03', '04', '05', '06']


#### How to convert year-month string to dates corresponding to the 4th day of the month?

In [122]:
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
ser_dt=pd.to_datetime(ser)
pd.to_datetime( (ser_dt.dt.year).astype(str)+"-"+(ser_dt.dt.month).astype(str) +"-"+(ser_dt.dt.day+3).astype(str))


0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

#### How to filter words that contain atleast 2 vowels from a series?

In [124]:
from collections import Counter

In [130]:
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) for i in list('aeiou')]) >= 2)
ser[mask]



0     Apple
1    Orange
4     Money
dtype: object

#### How to filter valid emails from a series?

In [134]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
import re
emails.str.findall(pattern, re.IGNORECASE)

0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object

#### How to get the mean of a series grouped by another series?

In [142]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
print(weights.tolist())
print(fruit.tolist())
weights.groupby(fruit).mean()

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['carrot', 'banana', 'carrot', 'carrot', 'apple', 'carrot', 'banana', 'apple', 'banana', 'carrot']


apple     6.5
banana    6.0
carrot    4.8
dtype: float64

#### How to compute the euclidean distance between two series?

In [148]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

sum((p-q)**2 )**.5

np.linalg.norm(p-q)

18.16590212458495

#### How to find all the local maxima (or peaks) in a numeric series?

In [153]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])
#print(np.sign(np.diff(ser)))
dd=np.diff( np.sign(np.diff(ser)))
np.where(dd==-2)[0]+1

array([1, 5, 7], dtype=int64)

#### How to replace missing spaces in a string with the least frequent character?

In [176]:
my_str = 'dbc deb abed gade'
ll= pd.Series(list(my_str))
prileast_var=(ll.value_counts().dropna().index[-1])
"".join( ll.replace(" ", prileast_var))

'dbcgdebgabedggade'

#### How to compute the autocorrelations of a numeric series?

In [185]:
ser = pd.Series(np.arange(20) + np.random.normal(1, 10, 20))
auto=[ser.autocorr(i) for i in range(11)]
np.argmax(np.abs(auto[1:]))+1

3

#### How to import only every nth row from a csv file to create a dataframe?

In [192]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', chunksize=50)
df_1=pd.DataFrame()
for ch in df:
    df_1=df_1.append(ch.iloc[0,:])
df_1.shape

  df_1=df_1.append(ch.iloc[0,:])
  df_1=df_1.append(ch.iloc[0,:])
  df_1=df_1.append(ch.iloc[0,:])
  df_1=df_1.append(ch.iloc[0,:])
  df_1=df_1.append(ch.iloc[0,:])
  df_1=df_1.append(ch.iloc[0,:])
  df_1=df_1.append(ch.iloc[0,:])
  df_1=df_1.append(ch.iloc[0,:])
  df_1=df_1.append(ch.iloc[0,:])
  df_1=df_1.append(ch.iloc[0,:])
  df_1=df_1.append(ch.iloc[0,:])


(11, 14)

#### How to change column values when importing csv to a dataframe?

In [196]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
                 converters={'medv' : lambda  x:  "Low" if float(x)<25 else "High"}
                 )
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,Low
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,Low
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,High
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,High
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,High


In [194]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv')
df['medv']=df.medv.apply(lambda  x : "Low" if x<25 else "High")
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,Low
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,Low
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,High
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,High
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,High
