## Data Cleaning

### Imports 

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Read and explore Data.csv

In [4]:
# Load the raw dataset; the path is relative to the notebook's working directory.
csv_path = "../data/Data.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Date,Target_Name,Target_Industry_Macro,Target_Industry_Mid,Target_Nation,Buyer_Name,Buyer_Industry_Macro,Buyer_Industry_Mid,Buyer_Nation,Target_Revenues,Target_EBITDA,Target_EV,Target_Industry_Detailed,Target_Status,Year,EV_Rev,EV_EBITDA
0,14/08/2024,Sarsys-Asft AB,Industrials,Other Industrials,Sweden,Grundbulten 137100 AB,Financials,Other Financials,Sweden,3.76,-0.23,2.88,"Measuring, Medical, Photo Equipment; Clocks",Public,2024.0,0.77,-12.55
1,08/08/2024,GSE Systems Inc,High Technology,Software,United States,Nuclear Engineering Holdings LLC,Financials,Other Financials,United States,41.81,-1.02,13.17,Prepackaged Software,Public,2024.0,0.32,-12.91
2,06/08/2024,INEO Tech Corp,Consumer Products and Services,Professional Services,Canada,Coenda Investment Holdings Corp,Financials,Other Financials,Canada,0.98,-1.78,3.91,Business Services,Public,2024.0,3.97,-2.19
3,03/08/2024,Big Cheese Studio SA,High Technology,Software,Poland,Investor Group,Financials,Other Financials,Poland,2.71,1.05,11.29,Prepackaged Software,Public,2024.0,4.16,10.72
4,01/08/2024,Braille Energy Systems Inc,Energy and Power,Other Energy & Power,Canada,Undisclosed Acquiror,Financials,Brokerage,Unknown,2.43,-1.35,5.86,Electronic and Electrical Equipment,Public,2024.0,2.41,-4.33


In [5]:
# Inspect the dtype pandas inferred for every column right after loading.
df.dtypes

Date                         object
Target_Name                  object
Target_Industry_Macro        object
Target_Industry_Mid          object
Target_Nation                object
Buyer_Name                   object
Buyer_Industry_Macro         object
Buyer_Industry_Mid           object
Buyer_Nation                 object
Target_Revenues              object
Target_EBITDA                object
Target_EV                    object
Target_Industry_Detailed     object
Target_Status                object
Year                         object
EV_Rev                      float64
EV_EBITDA                   float64
dtype: object

### Convert numeric columns stored as text (`object`) into floats and ints

In [6]:
# Columns that were loaded as object dtype but hold numbers.
# Commas are stripped before casting (presumably thousands separators -- confirm
# against the raw file).
numeric_text_cols = ['Target_Revenues', 'Target_EBITDA', 'Target_EV', 'Year']

for col in numeric_text_cols:
    values = df[col]
    # Only touch the .str accessor while the column is still text; this keeps the
    # cell safe to re-run after the columns have already been converted.
    if not pd.api.types.is_numeric_dtype(values):
        values = values.str.replace(',', '', regex=False)
    df[col] = values.astype(float)

# Year appears as e.g. "2024.0" in the raw data, so it has to pass through float
# before it can be cast to int. This raises if any Year is missing (NaN).
df['Year'] = df['Year'].astype(int)

df.head()

Unnamed: 0,Date,Target_Name,Target_Industry_Macro,Target_Industry_Mid,Target_Nation,Buyer_Name,Buyer_Industry_Macro,Buyer_Industry_Mid,Buyer_Nation,Target_Revenues,Target_EBITDA,Target_EV,Target_Industry_Detailed,Target_Status,Year,EV_Rev,EV_EBITDA
0,14/08/2024,Sarsys-Asft AB,Industrials,Other Industrials,Sweden,Grundbulten 137100 AB,Financials,Other Financials,Sweden,3.76,-0.23,2.88,"Measuring, Medical, Photo Equipment; Clocks",Public,2024,0.77,-12.55
1,08/08/2024,GSE Systems Inc,High Technology,Software,United States,Nuclear Engineering Holdings LLC,Financials,Other Financials,United States,41.81,-1.02,13.17,Prepackaged Software,Public,2024,0.32,-12.91
2,06/08/2024,INEO Tech Corp,Consumer Products and Services,Professional Services,Canada,Coenda Investment Holdings Corp,Financials,Other Financials,Canada,0.98,-1.78,3.91,Business Services,Public,2024,3.97,-2.19
3,03/08/2024,Big Cheese Studio SA,High Technology,Software,Poland,Investor Group,Financials,Other Financials,Poland,2.71,1.05,11.29,Prepackaged Software,Public,2024,4.16,10.72
4,01/08/2024,Braille Energy Systems Inc,Energy and Power,Other Energy & Power,Canada,Undisclosed Acquiror,Financials,Brokerage,Unknown,2.43,-1.35,5.86,Electronic and Electrical Equipment,Public,2024,2.41,-4.33


In [7]:
# Re-check dtypes to confirm the numeric conversions took effect.
df.dtypes

Date                         object
Target_Name                  object
Target_Industry_Macro        object
Target_Industry_Mid          object
Target_Nation                object
Buyer_Name                   object
Buyer_Industry_Macro         object
Buyer_Industry_Mid           object
Buyer_Nation                 object
Target_Revenues             float64
Target_EBITDA               float64
Target_EV                   float64
Target_Industry_Detailed     object
Target_Status                object
Year                          int64
EV_Rev                      float64
EV_EBITDA                   float64
dtype: object

### Change 'Date' from 'object' to 'datetime'

In [8]:
# Dates are written day/month/year (e.g. 14/08/2024); parse them with an explicit
# format so day and month cannot be swapped.
parsed_dates = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['Date'] = parsed_dates

# Show the converted column.
df['Date']

0       2024-08-14
1       2024-08-08
2       2024-08-06
3       2024-08-03
4       2024-08-01
           ...    
14770   2014-01-14
14771   2014-01-13
14772   2014-01-13
14773   2014-01-03
14774   2014-01-01
Name: Date, Length: 14775, dtype: datetime64[ns]

In [9]:
# Re-check dtypes to confirm Date is now a datetime column.
df.dtypes

Date                        datetime64[ns]
Target_Name                         object
Target_Industry_Macro               object
Target_Industry_Mid                 object
Target_Nation                       object
Buyer_Name                          object
Buyer_Industry_Macro                object
Buyer_Industry_Mid                  object
Buyer_Nation                        object
Target_Revenues                    float64
Target_EBITDA                      float64
Target_EV                          float64
Target_Industry_Detailed            object
Target_Status                       object
Year                                 int64
EV_Rev                             float64
EV_EBITDA                          float64
dtype: object

### Check counts of `Target_Industry_Macro`

In [11]:
# Number of rows per target macro industry, as a two-column frame
# (Target_Industry_Macro, Count) ordered from most to least frequent.
target_macro_df = (
    df['Target_Industry_Macro']
    .value_counts()
    .rename_axis('Target_Industry_Macro')
    .reset_index(name='Count')
)
target_macro_df

Unnamed: 0,Target_Industry_Macro,Count
0,High Technology,2372
1,Financials,2226
2,Industrials,1785
3,Energy and Power,1366
4,Healthcare,1339
5,Materials,1257
6,Consumer Products and Services,1015
7,Media and Entertainment,818
8,Real Estate,794
9,Consumer Staples,783


#### Remove 'Government and Agencies' rows

In [13]:
# Boolean mask that is True for every row whose target macro industry is
# anything other than 'Government and Agencies'.
not_government = df['Target_Industry_Macro'].ne('Government and Agencies')
df = df.loc[not_government]

# Rows and columns remaining after the filter.
df.shape

(14768, 17)

#### Check counts of target countries (`Target_Nation`)

In [16]:
# Number of rows per target country, as a two-column frame
# (Target_Nation, Count) ordered from most to least frequent.
count_df = (
    df['Target_Nation']
    .value_counts()
    .rename_axis('Target_Nation')
    .reset_index(name='Count')
)
count_df

Unnamed: 0,Target_Nation,Count
0,United States,4475
1,Canada,2635
2,United Kingdom,1199
3,France,955
4,Italy,952
5,Poland,679
6,Sweden,623
7,Germany,445
8,Spain,443
9,Norway,351


### Remove outliers

In [17]:
# EV_Rev is expected to lie between 0 and 100; summary statistics show how far
# the actual values stray from that range.
df['EV_Rev'].describe()

count     14768.000000
mean         35.537339
std        1521.414109
min           0.000000
25%           0.920000
50%           2.260000
75%           5.912500
max      177258.470000
Name: EV_Rev, dtype: float64

In [18]:
# Rows whose EV_Rev is strictly above the upper limit of 100.
ev_rev_too_high = df['EV_Rev'].gt(100)
high_EV_Rev = df.loc[ev_rev_too_high]

# Shape of that subset (rows, columns).
high_EV_Rev.shape

(259, 17)

In [19]:
# EV_EBITDA is expected to lie between -100 and 500; summary statistics show how
# far the actual values stray from that range.
df['EV_EBITDA'].describe()

count     14768.000000
mean        -23.084318
std        3872.692326
min     -455576.590000
25%           1.780000
50%           8.325000
75%          16.160000
max       33655.510000
Name: EV_EBITDA, dtype: float64

In [23]:
# Rows whose EV_EBITDA is strictly below the lower limit of -100.
ev_ebitda_too_low = df['EV_EBITDA'].lt(-100)
low_EV_EBITDA = df.loc[ev_ebitda_too_low]

# Shape of that subset (rows, columns).
low_EV_EBITDA.shape

(327, 17)

In [24]:
# Rows whose EV_EBITDA is strictly above the upper limit of 500.
ev_ebitda_too_high = df['EV_EBITDA'].gt(500)
high_EV_EBITDA = df.loc[ev_ebitda_too_high]

# Shape of that subset (rows, columns).
high_EV_EBITDA.shape

(87, 17)

In [25]:
# Drop outlier rows: keep EV_Rev <= 100 and -100 <= EV_EBITDA <= 500.
# The limits are inclusive so that a row sitting exactly on a limit is kept;
# the outlier counts in this notebook use strict comparisons (> 100, < -100,
# > 500), so only rows counted there as outliers are removed here.
# Rows with NaN in either column are dropped, because comparisons with NaN
# evaluate to False.
ev_rev_ok = df['EV_Rev'] <= 100
ev_ebitda_ok = df['EV_EBITDA'].between(-100, 500)  # inclusive on both ends
df = df[ev_rev_ok & ev_ebitda_ok]

df.shape

(14156, 17)

In [None]:
# Candidate model features (planning notes only; this cell runs no code)

# Date
# Target_Industry_Macro (11 one-hot)
# Target_Nation --> Region (7 one-hot)
# Buyer_Industry_Macro? (open question)
# Buyer region? (open question)
# Target_Status (Private = 0, Public = 1)
# Target_Revenues (continuous)
# Target_EBITDA (continuous)
# Target_EV (continuous)