# 3.3 Advanced Data Processing
## 3.3.1 Outliers

Define a function that removes outliers using the IQR (interquartile range) method:

```python
import pandas as pd 
import numpy as np  

def filter_outliers(df, column_name):
    """
    Return the rows of ``df`` whose ``column_name`` value is not an IQR outlier.

    A value is kept when it lies within
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds included), where Q1 and Q3
    are the 25th and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters:
    - df: Pandas DataFrame to filter
    - column_name: Name of the numerical column used to detect outliers

    Returns:
    - filtered_data: DataFrame restricted to the rows inside the bounds.
      Rows whose value is missing (NaN) are dropped, because NaN never
      compares as being inside the interval.
    """
    data = df[column_name]
    # Series.quantile skips NaN. np.percentile would return NaN as soon as
    # one value is missing, turning both bounds into NaN and discarding
    # every row of the DataFrame.
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # between() includes both bounds by default, i.e. >= lower and <= upper.
    filtered_data = df[data.between(lower_bound, upper_bound)]
    return filtered_data
```

Handle outliers using Z-score method:

```python
import pandas as pd
import numpy as np  

def filter_outliers_zscore(df, column_name, threshold=3):
    """
    Return the rows of ``df`` whose ``column_name`` value is not a Z-score outlier.

    The Z-score of a value is ``(value - mean) / std``, using the population
    standard deviation (``ddof=0``) of the column. A row is kept when the
    absolute Z-score is at most ``threshold``.

    Parameters:
    - df: Pandas DataFrame to filter
    - column_name: Name of the numerical column used to detect outliers
    - threshold: Largest absolute Z-score that is still kept (default 3)

    Returns:
    - filtered_data: DataFrame restricted to the rows within the threshold.
      Rows whose value is missing (NaN) are dropped.
    """
    data = df[column_name]
    mean = data.mean()
    # ddof=0 gives the population standard deviation (np.std's default),
    # whereas Series.std defaults to the sample standard deviation (ddof=1).
    std = data.std(ddof=0)

    # A zero spread means no value deviates from the mean, and an undefined
    # spread (empty or all-NaN column) leaves nothing to score. Dividing by
    # std in either case would only produce NaN, so keep the non-missing rows.
    if pd.isna(std) or std == 0:
        return df[data.notna()]

    # Vectorised Z-scores stay aligned with df's index, so the boolean mask
    # can be used directly (a plain Python list does not support abs()).
    z_scores = (data - mean) / std
    filtered_data = df[z_scores.abs() <= threshold]
    return filtered_data
```

Cap values using clip method:

```python
import pandas as pd 

def cap_missing_values(column, lower_cap, upper_cap):
    """
    Limit the values of a numerical column to the range [lower_cap, upper_cap].

    This is a thin wrapper around the Pandas ``clip`` method: values below
    ``lower_cap`` are replaced by ``lower_cap`` and values above ``upper_cap``
    are replaced by ``upper_cap``. The function only clips; it does not fill
    in missing entries.

    Parameters:
    - column: Pandas Series representing a column of numerical data
    - lower_cap: Smallest value allowed in the result
    - upper_cap: Largest value allowed in the result

    Returns:
    - A new Series holding the clipped values
    """
    return column.clip(lower=lower_cap, upper=upper_cap)
```

## 3.3.2 Approximate Values

Using TheFuzz for fuzzy string matching:

```python
import pandas as pd
from thefuzz import fuzz, process

# Sample data: the same four country names written with inconsistent
# casing and abbreviations
df = pd.DataFrame(
    data={
        'country': [
            'USA', 'US', 'USA', 'usa',
            'France', 'FRANCE', 'fraNce',
            'SpAIn', 'Spain', 'spain', 'SPAIN',
            'CaNAda', 'canada', 'CANADA',
        ],
    },
)

# Reference spellings that the raw values are matched against
ref_list = ['France', 'USA', 'Canada', 'Spain']

def fuzzy_grouping(df, col_name, ref_list=ref_list, threshold=80):
    """
    Group the distinct values of a column under their closest reference value.

    Every distinct value of ``df[col_name]`` is scored against every entry of
    ``ref_list`` with ``fuzz.token_set_ratio``. A value is mapped to the
    reference it scores highest against, provided that score is at least
    ``threshold``.

    Side effect: a 'corrected_value' column holding the mapped reference is
    added to ``df`` in place. Values that match no reference map to NaN and
    therefore do not appear in the grouped result.

    Parameters:
    - df: Pandas DataFrame containing the column to clean
    - col_name: Name of the column holding the raw values
    - ref_list: Reference spellings to map the raw values to
    - threshold: Minimum similarity score for a match (default 80)

    Returns:
    - df_grouped: DataFrame with one row per matched reference value
      ('value') and the array of raw spellings mapped to it
      ('mispelled_values')
    """
    # The candidate values do not depend on the reference, so compute once.
    unique_values = df[col_name].unique()

    mappings = {}
    best_scores = {}

    for ref_val in ref_list:
        scored_matches = process.extract(ref_val,
                                         unique_values,
                                         scorer=fuzz.token_set_ratio,
                                         limit=None)
        for match in scored_matches:
            value, score = match[0], match[1]
            if score < threshold:
                continue
            # Keep the best-scoring reference for each value. Without this
            # check a value that clears the threshold for several references
            # would be assigned to whichever reference comes last in
            # ref_list, regardless of how well it matches. On ties the later
            # reference still wins.
            if value not in best_scores or score >= best_scores[value]:
                best_scores[value] = score
                mappings[value] = ref_val

    df['corrected_value'] = df[col_name].map(mappings)
    df_grouped = df.groupby('corrected_value')[col_name].unique().reset_index()
    df_grouped.columns = ['value', 'mispelled_values']

    return df_grouped

# Group the country spellings using a similarity cutoff of 60
grouped_df = fuzzy_grouping(df=df, col_name='country', threshold=60)
print(grouped_df)
```

Using RecordLinkage for data matching:

```python
# Import the required libraries
import pandas as pd
import recordlinkage

# Load the two datasets to link
df1 = pd.read_csv('datasets1.csv')
df2 = pd.read_csv('datasets2.csv')

# Create an indexer: blocking on 'client_index' means only record pairs that
# share the same 'client_index' value become candidate links
indexer = recordlinkage.Index()
indexer.block('client_index')
candidate_links = indexer.index(df1, df2)

# Create compare object
compare = recordlinkage.Compare()

# Add columns to compare: phone numbers agree (feature = 1) when their
# Jaro-Winkler similarity is at least 0.85, otherwise the feature is 0
compare.string('client_phone_number',
               'client_phone_number',
               method='jarowinkler',
               threshold=0.85,
               label='phone_match')

# Compare datasets: one row per candidate pair, one column per comparison
features = compare.compute(candidate_links, df1, df2)

# Find matches: a pair matches when every registered comparison agrees.
# Only one comparison is registered, so a row sum can never reach a
# hard-coded 3; derive the requirement from the number of feature columns.
min_matching_features = features.shape[1]
matches = features[features.sum(axis=1) >= min_matching_features]

# The feature table is indexed by (df1 row label, df2 row label). Name the
# two levels and turn them into columns so they can serve as join keys.
# Reassign instead of using inplace=True, since 'matches' is a filtered slice.
matches = matches.rename_axis(index=['df1_index', 'df2_index']).reset_index()

# Attach the original records on their row labels. The feature table has no
# 'client_phone_number' column, so the join must go through the index levels.
# Suffixes keep the identically named df1/df2 columns apart.
final_df = (
    matches
    .merge(df1.add_suffix('_df1'),
           how='left',
           left_on='df1_index',
           right_index=True)
    .merge(df2.add_suffix('_df2'),
           how='left',
           left_on='df2_index',
           right_index=True)
)

## 3.3.3 Time Series

Convert dates and handle missing values in time series:

```python
import pandas as pd  
# Parse the day/month/year strings of the 'date' column and use them as index
df['date'] = pd.to_datetime(df['date'], format="%d/%m/%Y")
df = df.set_index('date')

# Reindex to a daily frequency (missing days become NaN rows), then forward
# fill so each gap takes the last known values.
# The method is named asfreq (no underscore), and ffill() replaces the
# misspelled, deprecated fillna(method=...) form.
df = df.asfreq('D').ffill()
```

Extract features from datetime:

```python
# Derive calendar features from the 'date' column through its datetime accessor
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['day_of_week'] = date_parts.dayofweek  # Monday=0
```

Check time series continuity:

```python
# Number of whole days separating each index entry from the previous one
index_as_series = df.index.to_series()
days_since_previous = index_as_series.diff().dt.days
# 'gaps' is a boolean Series that is True wherever more than one day
# separates a data point from the one before it (a discontinuity)
gaps = days_since_previous > 1
```