### Miscellaneous Commands

In [None]:
# Quick-inspection one-liners for a DataFrame `df`.
df.shape
df.columns
df.info()
df.describe()
df.describe(include='object')
df.nunique()
df.isna().sum()
df.agg(['mean','median','sum','count','min','max']).transpose()
df.dtypes.reset_index() #-- df.dtypes is a Series; reset_index() turns it into a DataFrame
# Series.rename() has no `columns=` argument, so convert to a DataFrame first.
df.dtypes.reset_index().rename(columns = {"index" : "column", 0 : "Dtype"})
df['Airline'].value_counts(normalize=True)
type(df)
len(df)
df.head()
df.tail()
df.sample(5)
df.sample(frac=0.1, random_state = 529)
df.to_csv("path", index=False)
df.drop([cols], axis=1) #-- [cols] is a placeholder for a list of column names, e.g. ['col1', 'col2']

pd.read_csv("path", nrows=1000) #-- for parquet nrows doesn't work

### Subsetting

In [None]:
# Column-based subsetting of `df`.
df.columns[0:5]                                   # first five column labels
df.select_dtypes(include='int')                   # only the integer-typed columns
df.loc[:, df.columns[0:5]]                        # first five columns
df.loc[:, df.columns[::5]]                        #-- every 5th column
[name for name in df.columns if 'Time' in name]   # labels containing 'Time'
df['Airline']                                     #-- Series
df[['Airline']]                                   #-- DataFrame

### Filtering 
- loc, iloc, np.where, query

In [None]:
# Row / cell selection. f1 and f2 stand for boolean conditions.
df.iloc[1, 3]                            # by position: row 1, column 3 (zero-based)
df.loc[:, ['Airline']]                   # by label, result stays a DataFrame
df.loc[df['Airline'].eq('Spirit Air')]   # rows matching one value
df.loc[(f1) & (f2)]                      # both conditions hold
df.loc[~((f1) & (f2))]                   # negation of the combined condition
df.query('Deptime>1130')
df.query('(f1) & (f2)')

np.where(df['col1'].isna(), df['col2'], df['col1'])
#       (    condition    ,    Yes    ,     No   )

#### Summarizing

In [None]:
# Single-column summary statistics.
df['time'].sum()
df['time'].count()
df['time'].mean()
df['time'].median()
df['time'].min()
df['time'].max()
df['time'].std()
df['time'].var()
df['time'].quantile([0.25,0.75])

# Several statistics at once; [cols] is a placeholder for a list of column
# names, e.g. ['col1', 'col2'].
df[[cols]].agg(['min','max']).transpose()
df[[cols]].agg({'col1':['mean','max'], 'col2':['mean']}).transpose()

#### Category Variables

In [None]:
# Inspecting the distinct values of a single column.
df['col1'].unique()                       # distinct values
df['col1'].nunique()                      # how many distinct values
df['col1'].drop_duplicates()              # distinct values, as a Series
df['col1'].value_counts()                 # frequency of each value
df['col1'].value_counts().to_frame()      # ...as a DataFrame
df['col1'].value_counts().reset_index()   # ...with the values moved into a column
df['col1'].clip(lower=1000, upper=2000)   #--> <1000 equals to 1000, >2000 equals 2000

#### Group By

In [None]:
# Three alternative ways to finish the same grouped selection.
# Each line is an independent statement, not one chained expression.
df.groupby('Airline')[['DepDelay', 'ArrDelay']].mean()               # one row per Airline
df.groupby('Airline')[['DepDelay', 'ArrDelay']].agg(['min','max'])   # several statistics per Airline
df.groupby('Airline')[['DepDelay', 'ArrDelay']].transform('sum')     # result keeps df's row count

- **agg()** → Reduces data → Group-level result  
- **transform()** → Keeps original size → Row-level aligned result

#### Sort Values

In [None]:
# Ordering rows.
df.sort_values(by=['ArrDelay'], ascending=False)   # largest ArrDelay first
df.sort_index()                                    # order by the index labels
df.reset_index(drop=True)                          # new default index, old one discarded

#### Handling Missing Data

In [None]:
# Detecting, dropping and filling missing values in one column.
df['col1'].isna()
df['col1'].isna().sum()
df['col1'].dropna()
df['col1'].fillna(0)   # fillna() needs a fill value; calling it with no arguments raises
df['col1'].fillna(df['ArrDelay'].mean())

#### Combine data 

In [None]:
df.query('(f1) & (f2)')    #--> query() requires an expression string; f1/f2 are placeholder conditions
df.reset_index(drop=True)  #--> drop=True discards the old index instead of keeping it as a column
df_stack = pd.concat([df1,df2]) #--> axis = 0, union ÷
df_side = pd.concat([df1, df2], axis=1) #--> join on index values ⋮|⋮

In [None]:
#### Merge data

# pd.merge takes the two frames as separate arguments (left, right), not as a list.
pd.merge(df1, df2, how='left', on=['Airlines'], suffixes = ('_dep', '_arr'))

| Aspect         | `concat()`                                                   | `merge()`                                 |
| -------------- | ------------------------------------------------------------ | ----------------------------------------- |
| Syntax         | `pd.concat()`                                                | `pd.merge()` or `df.merge()`              |
| Join Basis     | Joins by **axis (0 or 1)** and aligns using **index values** | Joins using **specified column(s)/keys**  |
| Axis           | `axis=0` → row-wise<br>`axis=1` → column-wise                | Column/key-based join                     |


✅ Quick mnemonic:

- merge → match columns (SQL join)

- concat → stack data (append)

#### Rename columns

In [None]:
# Map old column names to new ones.
df.rename(columns={'asin': 'ASIN', 'source_col': 'col2'})
# Cast a column to the pandas string dtype.
df['col1'].astype('string')

#### Reassign Values

In [None]:
# Overwrite 'topic' with 'neg' on the rows where it currently equals 'negative'.
df.loc[df['topic'].eq('negative'), 'topic'] = 'neg'

#### Ranking

In [None]:
# Overall rank, highest value first.
# NOTE(review): with the default tie method tied values get fractional ranks,
# which astype(int) truncates, and missing values make astype(int) raise;
# confirm this is acceptable for your data.
df['Rank'] = df['col'].rank(ascending=False).astype(int)
# Rank within each group: select the column to rank ('col2' is a placeholder)
# so the result is a single Series that can be assigned to one column.
df['Rank'] = df.groupby(['col'])['col2'].rank(ascending=False, method='dense')

#### isna()

In [None]:
# Keep only the rows where 'col1' is missing.
df = df[df['col1'].isna()]

#### Unique values per group

In [None]:
# For each 'col1' group, the distinct 'col2' values.
df.groupby(by='col1')['col2'].unique()

#### Assigning nulls

In [None]:
# Two ways to build a one-element Series holding a missing value.
a = pd.Series(data=[None])
a = pd.Series(data=[np.nan])

#### Changing df in place

In [None]:
# Modifies df itself: keeps the first row for each 'email' and removes the rest.
df.drop_duplicates(
    subset=['email'],
    keep='first',
    inplace=True,
)

#### Lag, Lead

In [None]:
# Shift rows along the index.
df.shift(periods=1)    #---> lag
df.shift(periods=-1)   #---> lead

#### Timestamp Diff

In [None]:
# Difference between two datetime columns.
(df['date1'] - df['date2']).dt.days                   # whole days
(df['date1'] - df['date2']).dt.total_seconds()/3600   # hours (3600 seconds per hour)

# Date parts.
pd.DatetimeIndex(df['date']).year
pd.DatetimeIndex(df['date']).month
pd.DatetimeIndex(df['date']).day
# First day of the month
pd.DatetimeIndex(df['date']).to_period('M').to_timestamp()
# Last day of the month -- NOTE(review): confirm on your pandas version
pd.DatetimeIndex(df['date']).to_period('M').to_timestamp('M')
# Example: 15th day of the month
pd.DatetimeIndex(df['date']).to_period('M').to_timestamp() + pd.offsets.Day(14)

#### fillna

In [None]:
#---> coalesce: take col1, falling back to col2 where col1 is missing

df['col3'] = df['col1'].fillna(value=df['col2'])
np.where(df['col1'].isna(), df['col2'], df['col1'])