<a href="https://colab.research.google.com/github/engmariamahmed04/NTI-ML-tasks/blob/main/employee_number_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

#  Load dataset
# Keep only the columns the model uses and drop incomplete rows: SVR cannot be
# fitted on NaN values, and a missing Country/Industry would otherwise end up
# as a "nan" entry when the unique values of those columns are listed.
df = pd.read_csv("organizations-100.csv")
df = df[['Founded', 'Country', 'Industry', 'Number of employees']].dropna()

#  One-hot encode
# Each Country/Industry value becomes its own "Country_<name>" /
# "Industry_<name>" indicator column; 'Founded' stays numeric.
df_encoded = pd.get_dummies(df, columns=['Country', 'Industry'])

#  Split features/target
X = df_encoded.drop('Number of employees', axis=1)
y = df_encoded['Number of employees']

#  Split train/test
# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#  Scaling
# The scaler is fitted on the training rows only and then reused for the test
# rows, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#  Train SVM Regressor
# NOTE(review): the target is left unscaled and SVR runs with its default
# hyper-parameters; the recorded run reports a slightly negative R², i.e. no
# better than predicting the mean. Consider scaling the target and/or tuning
# C and epsilon -- to be confirmed on a larger dataset.
svm = SVR(kernel='rbf')
svm.fit(X_train_scaled, y_train)

#  Evaluate
# RMSE is reported in the same unit as the target (number of employees).
y_pred = svm.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(" Model Performance:")
print("R² Score:", r2_score(y_test, y_pred))
print("RMSE:", rmse)

#  Simplified user input
def _read_int(prompt, error_message):
    """Ask with *prompt* until the user types an integer, then return it.

    *error_message* is printed after every entry that int() rejects.
    """
    while True:
        try:
            return int(input(prompt))
        except ValueError:
            print(error_message)


def _choose_index(heading, noun, options):
    """Print a numbered menu of *options* and return the index the user picks.

    *heading* is the plural used in the menu title and *noun* the singular
    used in the prompt and in the out-of-range message. Raises ValueError
    when *options* is empty, because no entry could ever be accepted and the
    prompt would repeat forever.
    """
    if not options:
        raise ValueError(f"No {noun} options available to choose from.")

    print(f"\nAvailable {heading}:")
    for i, option in enumerate(options):
        print(f"{i}: {option}")

    while True:
        index = _read_int(f"Choose {noun} by number: ",
                          "Invalid input. Please enter a number.")
        if 0 <= index < len(options):
            return index
        print(f"Invalid {noun} number. Please choose a number from the list.")


print("\nEnter the following info to predict number of employees:")

# dropna() keeps a missing value from being offered as a selectable option.
countries = df['Country'].dropna().unique().tolist()
country_index = _choose_index("Countries", "country", countries)
chosen_country = countries[country_index]

industries = df['Industry'].dropna().unique().tolist()
industry_index = _choose_index("Industries", "industry", industries)
chosen_industry = industries[industry_index]

founded = _read_int("\nFounded year (e.g. 2005): ",
                    "Invalid input. Please enter a number for the year.")

#  Build input row
# One indicator column per known country/industry, named the same way as the
# one-hot encoded training columns ("Country_<name>", "Industry_<name>"); only
# the chosen country and industry are set to 1.
input_dict = {'Founded': [founded]}
input_dict.update(
    {f'Country_{name}': [1 if name == chosen_country else 0] for name in countries}
)
input_dict.update(
    {f'Industry_{name}': [1 if name == chosen_industry else 0] for name in industries}
)

# reindex puts the columns in the exact order the scaler was fitted on and
# fills any training column missing from input_dict with 0.
user_df = pd.DataFrame(input_dict).reindex(columns=X.columns, fill_value=0)
user_scaled = scaler.transform(user_df)

#  Predict
prediction = svm.predict(user_scaled)[0]
# Round to the nearest whole employee (int() alone truncates toward zero) and
# clamp at zero, since a negative head count is meaningless.
estimated_employees = max(0, int(round(float(prediction))))
print(f"\n Estimated Number of Employees: {estimated_employees:,}")
print(f" Average error from model is around ±{int(rmse):,} employees.")
print(" Model Performance:")
print("R² Score:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

 Model Performance:
R² Score: -0.014488928140430124
RMSE: 2651.8786754986286

Enter the following info to predict number of employees:

Available Countries:
0: Papua New Guinea
1: Finland
2: China
3: Turkmenistan
4: Mauritius
5: Bahamas
6: Pakistan
7: Heard Island and McDonald Islands
8: Kuwait
9: Uzbekistan
10: Bouvet Island (Bouvetoya)
11: Denmark
12: Liberia
13: United Arab Emirates
14: Sweden
15: Honduras
16: Uganda
17: Hong Kong
18: Botswana
19: Korea
20: Luxembourg
21: Guadeloupe
22: Monaco
23: Belgium
24: South Africa
25: Romania
26: Czech Republic
27: Christmas Island
28: Philippines
29: Australia
30: Chad
31: Zimbabwe
32: Nepal
33: Taiwan
34: Kyrgyz Republic
35: Bolivia
36: Kenya
37: Guatemala
38: Belarus
39: Jersey
40: Grenada
41: Cape Verde
42: Trinidad and Tobago
43: Benin
44: Western Sahara
45: Northern Mariana Islands
46: Germany
47: Canada
48: Tonga
49: French Southern Territories
50: Cote d'Ivoire
51: Mayotte
52: Cayman Islands
53: Nigeria
54: Marshall Islands
55: Palau