Importovanje biblioteka

In [1]:
import pandas 
import requests
import urllib.parse
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from bs4 import BeautifulSoup

Ucitavanje svih potrebnih fajlova

In [2]:
dfTarget = pandas.read_csv('dataset/objects.csv',low_memory=False) # fajl u kom se nalazi ciljno obiljezje
dfDegrees=pandas.read_csv('dataset/degrees.csv',low_memory=False) # fajl u kome se nalaze podaci o diplomama zaposlenih 
dfRelationship=pandas.read_csv('dataset/relationships.csv',low_memory=False) # fajl koji predstavlja medju tabelu koja povezuje zaposlene sa startapima
dfOffices=pandas.read_csv('dataset/offices.csv',low_memory=False)
print('Reading csv files is done...')

Reading csv files is done...


In [3]:
dfRelationship = dfRelationship.drop(columns=['id','relationship_id','start_at','end_at','is_past','sequence','created_at','updated_at'])

In [4]:
dfDegrees = dfDegrees.drop(columns=['id','graduated_at','created_at','updated_at'])

In [5]:
dfRelationship = dfRelationship.merge(dfDegrees,
                   on = "person_object_id", 
                   how = 'right')

In [6]:

dfRelationship.relationship_object_id = dfRelationship.relationship_object_id.fillna('')
dfRelationship.title = dfRelationship.title.fillna('')
dfRelationship.drop(dfRelationship[(dfRelationship.relationship_object_id)==''].index,inplace=True)

In [7]:
dfRelationship.drop(dfRelationship[~(dfRelationship["title"].str.contains('CEO') | dfRelationship["title"].str.contains('President') | (dfRelationship["title"].str.contains('Founder')))].index,inplace=True)

In [8]:
dfRelationship = dfRelationship.drop_duplicates(subset=['relationship_object_id'])

In [9]:
dfRelationship.institution = dfRelationship.institution.fillna('')
dfRelationship.drop(dfRelationship[(dfRelationship.institution)==''].index,inplace=True)

In [10]:
dfRelationship.rename(columns={'relationship_object_id': 'object_id'}, inplace=True)

In [11]:
dfRelationship

Unnamed: 0,person_object_id,object_id,title,degree_type,subject,institution
3,p:6005,c:2360,CEO,MS,Internet Technology,University of Greenwich
7,p:1017,c:300,Founder,BS,Computer Science,Northeastern University
8,p:1017,c:15531,"Founder, CEO, Chief R&D",BS,Computer Science,Northeastern University
9,p:1017,c:2420,"Vice President, Product",BS,Computer Science,Northeastern University
10,p:1017,c:24606,Founder,BS,Computer Science,Northeastern University
...,...,...,...,...,...,...
395871,p:268517,c:286172,Vice President,B.S.,Engineering,University of Massachusetts
395872,p:268517,c:286173,"Vice President, Chief Patent Counsel",B.S.,Engineering,University of Massachusetts
395901,p:268519,c:286192,Vice President and Associate General Counsel I...,Bachelor of Science,Engineering,"Auburn University, Auburn, Alabama"
395956,p:268521,c:286205,President,BS,Mechanical Engineering,"University of California, Santa Barbara"


# Scraping sajta "World University Rankings 2022-23"

In [12]:
session = requests.Session()
session.headers.update({
   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;Win64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 '
                 'Safari/537.36'})

url = 'https://cwur.org/2022-23.php'
page = session.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

thead = soup.find('thead')
headings = thead.find_all('th')
attributes = [heading.text for heading in headings]

tbody = soup.find('tbody')
rows = tbody.find_all('tr')

data = []
for row in rows:
    cells = row.find_all('td')
    values = [cell.text for cell in cells]
    row_data = dict(zip(attributes, values))
    data.append(row_data)

univercity_df = pandas.DataFrame(data)
univercity_df.to_csv('cwur_universities.csv', index=True)
univercity_df

Unnamed: 0,World Rank,Institution,Location,National Rank,Education Rank,Employability Rank,Faculty Rank,Research Rank,Score
0,1,Harvard University\n CWUR Rating System: L...,USA,1,1,1,1,1,100
1,2,Massachusetts Institute of Technology\n Educ...,USA,2,4,12,2,7,96.7
2,3,Stanford University\n Education: A+; Employa...,USA,3,11,4,3,2,95.1
3,4,University of Cambridge\n Education: A+; Emp...,United Kingdom,1,3,25,4,10,94.1
4,5,University of Oxford\n Education: A+; Employ...,United Kingdom,2,7,27,9,4,93.3
...,...,...,...,...,...,...,...,...,...
1995,1996,Bharathidasan University,India,66,-,-,-,1921,65.8
1996,1997,Federal University of Maranhão,Brazil,56,-,-,-,1923,65.7
1997,1998,Jiangxi University of Finance and Economics,China,302,-,797,-,1948,65.7
1998,1999,University of Tarapacá,Chile,16,-,-,-,1925,65.7


Izbacivanje nepotrebnih kolona iz dfTarget dataframe-a

In [13]:
dfTarget=dfTarget.drop(columns=['entity_id','parent_id','permalink','closed_at','created_by',
                        'created_at','updated_at','normalized_name','domain',
                        'logo_url','logo_width','logo_height','short_description',
                        'description','overview','tag_list','relationships',
                        'country_code','state_code','region',
                        'relationships','created_by','created_at','updated_at'])

In [14]:
dfOffices=dfOffices.drop(columns=['id','office_id','description','address1','address2','zip_code','created_at','updated_at'])

In [15]:
dfTarget

Unnamed: 0,object_id,entity_type,name,category_code,status,founded_at,homepage_url,twitter_username,city,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones
0,c:1,Company,Wetpaint,web,operating,17/10/2005,http://wetpaint-inc.com,BachelrWetpaint,Seattle,,,0.0,0.0,01/10/2005,19/05/2008,3.0,39750000.0,05/09/2010,18/09/2013,5.0
1,c:10,Company,Flektor,games_video,acquired,,http://www.flektor.com,,Culver City,,,0.0,0.0,,,0.0,0.0,,,0.0
2,c:100,Company,There,games_video,acquired,,http://www.there.com,,San Mateo,,,0.0,0.0,,,0.0,0.0,01/02/2003,23/09/2011,4.0
3,c:10000,Company,MYWEBBO,network_hosting,operating,26/07/2008,http://www.mywebbo.com,,,,,0.0,0.0,,,0.0,0.0,,,0.0
4,c:10001,Company,THE Movie Streamer,games_video,operating,26/07/2008,http://themoviestreamer.com,,,,,0.0,0.0,,,0.0,0.0,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462647,r:9995,Product,"SiteLink, listing feed for Brokerages",,operating,,,,,,,0.0,0.0,,,0.0,0.0,,,0.0
462648,r:9996,Product,"EDCLink, listing feed for Economic Development...",,operating,,,,,,,0.0,0.0,,,0.0,0.0,,,0.0
462649,r:9997,Product,"Cmail, broadcast email marketing",,operating,,,,,,,0.0,0.0,,,0.0,0.0,,,0.0
462650,r:9998,Product,"CatylistCRM, contact database",,operating,,,,,,,0.0,0.0,,,0.0,0.0,,,0.0


In [16]:

dfTarget = dfTarget.merge(dfRelationship,
                   on = "object_id", 
                   how = 'right')
# dfTarget.drop_duplicates(['object_id'], keep="first", inplace=True) # izbacivanje onih koji imaju vise lokacija


Izbacivanje nepotrebnih redova i kolona iz dfTarget

In [17]:
#dfTarget
dfTarget[(dfTarget['status']=='closed')]

Unnamed: 0,object_id,entity_type,name,category_code,status,founded_at,homepage_url,twitter_username,city,first_investment_at,...,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,person_object_id,title,degree_type,subject,institution
3,c:2420,Company,RealNames,,closed,01/01/1996,http://www.realnames.com,,"San Carlos,",,...,0.0,0.0,01/01/2000,01/01/2000,1.0,p:1017,"Vice President, Product",BS,Computer Science,Northeastern University
21,c:880,Company,mEgo,web,closed,01/10/2005,http://www.mego.com,,Los Angeles,,...,4.0,6515116.0,,,0.0,p:2351,Co-founder & Co-CEO,LLB,Law,University of Toronto School of Law
38,c:1890,Company,Bizak,web,closed,01/11/2007,http://www.bizak.com,,Brookline,,...,1.0,0.0,,,0.0,p:5058,Founder,BS,Boston College,Political Science & Economics
43,c:2306,Company,QponDirect,web,closed,01/12/2006,http://www.qpondirect.com,,Pittsburgh,,...,1.0,300000.0,,,0.0,p:5918,"Founder, Chairman, VP Business Development",BS,Organizational Leadership & Information Techno...,Duquesne University
44,c:2452,Company,IShareWeShare,web,closed,01/01/2008,http://www.ishareweshare.com,,Givataym,,...,0.0,0.0,,,0.0,p:6135,"CEO, Founder",BS,Physics,Tel Aviv University
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35830,c:26530,Company,Embrella Cardiovascular,medical,closed,,http://www.embrella.net,,Wayne,,...,2.0,6700000.0,,,0.0,p:44265,Chairman and CEO,"Business, Marketing",,La Salle University
35839,c:136461,Company,Avantra Biosciences,biotech,closed,01/01/2010,http://www.avantrabio.com,,Woburn,,...,1.0,8000000.0,,,0.0,p:264374,Co-Founder,MBA,Entrepreneurship/Entrepreneurial Studies / Ope...,Rensselaer Polytechnic Institute
35886,c:50729,Company,Mpex Pharmaceuticals,biotech,closed,,http://www.mpexpharma.com,,San Diego,,...,1.0,32000000.0,01/01/2007,01/01/2007,1.0,p:116089,President & CEO,BA,Economics,Stanford University
36119,c:12188,Company,Mybandstock,games_video,closed,15/01/2009,http://www.mybandstock.com,,Los Angeles,,...,1.0,0.0,01/08/2010,01/08/2010,1.0,p:27189,Founder,,Music Technology / Music Business,University of Michigan


In [18]:
#izbacivanje nepotrebnih redova
dfTarget.drop(dfTarget[(dfTarget.status == 'alpha') | 
           (dfTarget.status=='ipo') | 
           (dfTarget.status=='live') | 
           (dfTarget.status=='beta') | 
           (dfTarget.status=='private') | 
           (dfTarget.status=='development')].index, inplace=True)
dfTarget.drop(dfTarget[(dfTarget.entity_type)=='Person'].index,inplace=True)


In [19]:
dfTarget.category_code=dfTarget.category_code.fillna('')
dfTarget.drop(dfTarget[(dfTarget.entity_type)=='Person'].index,inplace=True) #izbaci startupe za koje se ne zna iz koje drzave dolaze

In [20]:
dfTarget = dfTarget.merge(dfOffices, 
                   on = "object_id", 
                   how = 'right')
dfTarget

Unnamed: 0,object_id,entity_type,name,category_code,status,founded_at,homepage_url,twitter_username,city_x,first_investment_at,...,title,degree_type,subject,institution,region,city_y,state_code,country_code,latitude,longitude
0,c:1,Company,Wetpaint,web,operating,17/10/2005,http://wetpaint-inc.com,BachelrWetpaint,Seattle,,...,Co-Founder/CEO/Board of Directors,BS,Electrical Engineering/Computer Science,"University of California, Berkeley",Seattle,Seattle,WA,USA,47.603122,-122.333253
1,c:3,Company,Zoho,software,operating,15/09/2005,http://zoho.com,zoho,Pleasanton,,...,CEO and Founder,BS,,"Indian Institute of Technology, Madras",SF Bay,Pleasanton,CA,USA,37.692934,-121.904945
2,c:4,Company,Digg,news,acquired,11/10/2004,http://www.digg.com,digg,San Francisco,,...,Founder & CEO,,,"University of Nevada, Las Vegas",SF Bay,San Francisco,CA,USA,37.764726,-122.394523
3,c:5,,,,,,,,,,...,,,,,SF Bay,Menlo Park,CA,USA,37.416050,-122.151801
4,c:7,,,,,,,,,,...,,,,,SF Bay,Palo Alto,CA,ISR,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112713,f:15098,,,,,,,,,,...,,,,,New York,Greenwich,CT,USA,0.000000,0.000000
112714,c:286200,,,,,,,,,,...,,,,,Santa Barbara,Santa Barbara,CA,USA,0.000000,0.000000
112715,c:256895,Company,CompuMed,,operating,,http://compumed.ning.com,,Los Angeles,,...,CEO,,,"University of California, Berkeley",Los Angeles,Los Angeles,CA,USA,0.000000,0.000000
112716,c:256200,,,,,,,,,,...,,,,,New York,New York,NY,USA,0.000000,0.000000


Dodavanje kategorickih obiljezja za twiter i web_page

In [21]:
dfTarget['twiter_categorical']=1
dfTarget['web_page_categorical']=1
dfTarget.twitter_username = dfTarget.twitter_username.fillna('')
dfTarget.homepage_url = dfTarget.homepage_url.fillna('')
dfTarget.loc[(dfTarget['twitter_username'] ==''), 'twiter_categorical'] = 0
dfTarget.loc[(dfTarget['homepage_url'] ==''), 'web_page_categorical'] = 0
dfTarget=dfTarget.drop(columns=['twitter_username','homepage_url'])
dfTarget.loc[(dfTarget['category_code'] ==''), 'category_code'] = 'other'

In [22]:
dfTarget.category_code=dfTarget.category_code.fillna('')
dfTarget.entity_type=dfTarget.entity_type.fillna('')
dfTarget.name=dfTarget.name.fillna('')
dfTarget.drop(dfTarget[(dfTarget.entity_type)==''].index,inplace=True)
print(len(dfTarget[(dfTarget['status']=='closed')]))
print(len(dfTarget))
dfTarget
print(dfTarget.columns)

#dfTarget.region=dfTarget.region.fillna('') # SD: ne postoji obelezje region
#dfTarget.to_csv('dfTarget.csv')

554
23791
Index(['object_id', 'entity_type', 'name', 'category_code', 'status',
       'founded_at', 'city_x', 'first_investment_at', 'last_investment_at',
       'investment_rounds', 'invested_companies', 'first_funding_at',
       'last_funding_at', 'funding_rounds', 'funding_total_usd',
       'first_milestone_at', 'last_milestone_at', 'milestones',
       'person_object_id', 'title', 'degree_type', 'subject', 'institution',
       'region', 'city_y', 'state_code', 'country_code', 'latitude',
       'longitude', 'twiter_categorical', 'web_page_categorical'],
      dtype='object')


In [23]:

# address = 'New York'
# url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(address) +'?format=json'

# response = requests.get(url).json()
# print(response[0]["lat"])
# print(response[0]["lon"])
# print(dfTarget.columns)

# for index, row in dfOffices.iterrows():
#     if row['latitude']==0 and row['longitude']==0:
#         address=row['city']
#         url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(address) +'?format=json'
#         response = requests.get(url).json()
#         if (len(response)!=0):
#             print(response[0]["lat"])
#             print(response[0]["lon"])
#             row['latitude']=response[0]["lat"]
#             row['longitude']=response[0]["lon"]


In [24]:
dfTarget.founded_at=dfTarget.founded_at.fillna('')
dfTarget=dfTarget.drop(columns=['subject','degree_type','person_object_id'])
dfTarget

Unnamed: 0,object_id,entity_type,name,category_code,status,founded_at,city_x,first_investment_at,last_investment_at,investment_rounds,...,title,institution,region,city_y,state_code,country_code,latitude,longitude,twiter_categorical,web_page_categorical
0,c:1,Company,Wetpaint,web,operating,17/10/2005,Seattle,,,0.0,...,Co-Founder/CEO/Board of Directors,"University of California, Berkeley",Seattle,Seattle,WA,USA,47.603122,-122.333253,1,1
1,c:3,Company,Zoho,software,operating,15/09/2005,Pleasanton,,,0.0,...,CEO and Founder,"Indian Institute of Technology, Madras",SF Bay,Pleasanton,CA,USA,37.692934,-121.904945,1,1
2,c:4,Company,Digg,news,acquired,11/10/2004,San Francisco,,,0.0,...,Founder & CEO,"University of Nevada, Las Vegas",SF Bay,San Francisco,CA,USA,37.764726,-122.394523,1,1
5,c:8,Company,Postini,web,acquired,02/06/1999,San Carlos,,,0.0,...,Executive Vice President,University of Minnesota-Twin Cities,SF Bay,San Carlos,CA,USA,37.506885,-122.247573,0,1
6,c:9,Company,Geni,web,acquired,01/06/2006,West Hollywood,,,0.0,...,Founder & Chairman,"University of Chicago, School of Law",Los Angeles,West Hollywood,CA,USA,34.090368,-118.393064,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112682,c:243062,Company,Overdog,games_video,operating,01/01/2012,Nashville,,,0.0,...,Founder,Vanderbilt University,Nashville,Nashville,TN,USA,0.000000,0.000000,1,1
112684,c:243478,Company,Quip,other,operating,01/01/2012,San Francisco,,,0.0,...,Founder and CEO,Stanford University,SF Bay,San Francisco,CA,USA,0.000000,0.000000,1,1
112686,c:286125,Company,Koru,education,operating,,Seattle,,,0.0,...,CEO,The University of Western Ontario - Richard Iv...,Seattle,Seattle,WA,USA,0.000000,0.000000,0,1
112692,c:286148,Company,RewardCo,advertising,operating,,Sydney,,,0.0,...,Founder,Sydney Graduate School of Management,Sydney,Sydney,,AUS,0.000000,0.000000,1,1


In [25]:
dfTarget.to_csv('result1.csv', index=True)

### TODO: uraditi grupisanje po oblastima kategorije
IT-web,software,mobile,network_hosting,search,automotive,security
Advertising-advertising,news,ecomerce,social,public_relations
Medicine-biotech, hospitality, health, medical
Hardware-hardware,nanotech,manufacturing,semiconductor,
Nonprofit
Entertainment-games_video,,music,photo_video
Green tehcnology- cleantech
Consulting-government,finance,consulting,analytics,enterprise
Other-local,other,transportation,travel,pets,messaging,real_estate
Education-education,sports
Fashion-fashion,design

In [26]:
# print(dfTarget["entity_type"].unique())
categories_one_hot=pandas.get_dummies(dfTarget.category_code)
dfTarget = pandas.concat([dfTarget, categories_one_hot], axis=1)


In [27]:
entity_type_one_hot=pandas.get_dummies(dfTarget.entity_type)
dfTarget = pandas.concat([dfTarget, entity_type_one_hot], axis=1)

In [28]:
dfTarget=dfTarget.drop(columns=['category_code', 'entity_type', 'name', 'city_x', 'title', 'city_y', 'state_code', 'country_code', 'object_id',
                               'founded_at', 'first_investment_at', 'last_investment_at', 'first_funding_at', 'last_funding_at', 'first_milestone_at', 'last_milestone_at', 'institution', 'region' ])

In [46]:
dfTarget

Unnamed: 0,status,investment_rounds,invested_companies,funding_rounds,funding_total_usd,milestones,latitude,longitude,twiter_categorical,web_page_categorical,...,security,semiconductor,social,software,sports,transportation,travel,web,Company,FinancialOrg
0,operating,0.0,0.0,3.0,39750000.0,5.0,47.603122,-122.333253,1,1,...,0,0,0,0,0,0,0,1,1,0
1,operating,0.0,0.0,0.0,0.0,5.0,37.692934,-121.904945,1,1,...,0,0,0,1,0,0,0,0,1,0
2,acquired,0.0,0.0,4.0,45000000.0,8.0,37.764726,-122.394523,1,1,...,0,0,0,0,0,0,0,0,1,0
5,acquired,0.0,0.0,0.0,0.0,0.0,37.506885,-122.247573,0,1,...,0,0,0,0,0,0,0,1,1,0
6,acquired,0.0,0.0,3.0,16500000.0,4.0,34.090368,-118.393064,1,1,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112682,operating,0.0,0.0,1.0,450000.0,0.0,0.000000,0.000000,1,1,...,0,0,0,0,0,0,0,0,1,0
112684,operating,0.0,0.0,1.0,15000000.0,2.0,0.000000,0.000000,1,1,...,0,0,0,0,0,0,0,0,1,0
112686,operating,0.0,0.0,1.0,4350000.0,0.0,0.000000,0.000000,0,1,...,0,0,0,0,0,0,0,0,1,0
112692,operating,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,1,1,...,0,0,0,0,0,0,0,0,1,0


In [30]:
dfTarget['status'].value_counts()

operating    20671
acquired      2566
closed         554
Name: status, dtype: int64

In [31]:
y = dfTarget['status']
y = dfTarget['status'].map({'operating': 1, 'acquired': 1, 'closed': 0})
X = dfTarget.drop(columns=['status'])

## Undersampling

In [32]:
def undersamping(X, y):
    print("Klasna raspodela pre undersamplinga:", Counter(y))
    rus = RandomUnderSampler(random_state=42)

    X_res, y_res = rus.fit_resample(X, y)

    print("Klasna raspodela posle undersamplinga:", Counter(y_res))
    return X_res, y_res

## Oversampling 

In [33]:
def oversampling(X, y):
    # Primena SMOTE algoritma za oversampling podataka
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)

    # Prikaz balansiranih podataka
    balanced_df = pandas.concat([pandas.DataFrame(X_res), pandas.DataFrame(y_res, columns=['target'])], axis=1)
    print(balanced_df['target'].value_counts())
    return X_res, y_res

## Podela dataseta na test, train i validacioni - 80:10:10

In [34]:
def dataset_partitioning(X_balanced, y_balanced):
    X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.111, random_state=42, stratify=y_balanced)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42, stratify=y_train)

    print('Broj ciljnih vrednosti \'closed\' u trening skupu:', np.sum(y_train==0))
    print('Broj ciljnih vrednosti \'closed\' u test skupu:', np.sum(y_test==0))
    print('Broj ciljnih vrednosti \'closed\' u validacionom skupu:', np.sum(y_val==0))
    return X_train, y_train, X_test, y_test, X_val, y_val
    

## Random Forest algoritam

In [35]:
def random_forest(X_train, y_train, X_test, y_test, n_estimator): # Kreiraj instancu Random Forest klasifikatora sa 10 stabala
    rfc = RandomForestClassifier(n_estimators=n_estimator)

    # Obuči model na X_train i y_train podacima
    rfc.fit(X_train, y_train)

    # Predvidi klase za X_train i X_test podatke
    y_train_pred = rfc.predict(X_train)
    y_test_pred = rfc.predict(X_test)

    # Izračunaj tačnost modela
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print("Train Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print(y_test_pred)
    return y_test_pred

In [36]:
# y_test_pred_1 = randomForestAlgoritam(X_train, y_train, X_test, y_test, 5)

## Linearna regresija

In [37]:
def linear_regression(X_train, y_train, X_test, y_test):
    reg = LinearRegression()

    # Obuči model na trening podacima
    reg.fit(X_train, y_train)

    # Predvidi vrednosti za test podatke
    y_pred = reg.predict(X_test)

    # Izračunaj srednju kvadratnu grešku
    mse = mean_squared_error(y_test, y_pred)

    # Prikaz rezultata
    print("Koeficijenti regresije:", reg.coef_)
    print("Intercept:", reg.intercept_)
    print("Srednja kvadratna greška:", mse)
    print("-----")
    print(y_pred)
    return y_pred

## Precision, recall i F1 mera

In [38]:
def get_precision_recall_f1score_for_minority_class(y_test, y_test_pred):
    # Izvlacenje **manjiske klase**
    y_test1 = np.array(y_test)
    y_test_pred1 = np.array(y_test_pred)

    # Izdvajanje indeksa gde se nalaze vrednosti 0 u y_test
    idx = np.where(y_test1 == 0)[0]

    # Izdvajanje samo vrednosti 0 iz y_test i y_test_pred pomocu indeksa
    y_test1 = y_test1[idx]
    y_test_pred1 = y_test_pred1[idx]

    precision = precision_score(y_test1, y_test_pred1, pos_label=0)
    recall = recall_score(y_test1, y_test_pred1, pos_label=0)
    f1 = f1_score(y_test1, y_test_pred1, pos_label=0)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1-score: ", f1)

In [39]:
def get_precision_recall_f1score(y_test, y_test_pred):
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1-score: ", f1)

## Algoritmi nad undersamplovanim podacima

In [40]:
X_undersampling, y_undersampling = undersamping(X, y)
print("---")
X_train_u, y_train_u, X_test_u, y_test_u, X_val_u, y_val_u = dataset_partitioning(X_undersampling, y_undersampling)

Klasna raspodela pre undersamplinga: Counter({1: 23237, 0: 554})
Klasna raspodela posle undersamplinga: Counter({0: 554, 1: 554})
---
Broj ciljnih vrednosti 'closed' u trening skupu: 443
Broj ciljnih vrednosti 'closed' u test skupu: 62
Broj ciljnih vrednosti 'closed' u validacionom skupu: 49


### Random Forest

In [41]:
y_rnd_forest_u_pred = random_forest(X_train_u, y_train_u, X_test_u, y_test_u, 10)
print("*** Precision recall i f1 mera nad celim skupom ***")
get_precision_recall_f1score(y_test_u, y_rnd_forest_u_pred)
print("*** Precision recall i f1 mera nad manjinskoj klasi ***")
get_precision_recall_f1score_for_minority_class(y_test_u, y_rnd_forest_u_pred)

Train Accuracy: 0.9627539503386005
Test Accuracy: 0.8048780487804879
[0 1 0 1 0 1 1 1 0 1 0 1 0 1 0 0 1 0 0 0 0 1 0 1 1 1 0 1 0 0 0 1 0 0 1 0 0
 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 1 0 1
 1 1 0 1 1 1 1 0 1 1 0 1]
*** Precision recall i f1 mera nad celim skupom ***
Precision:  0.8245614035087719
Recall:  0.7704918032786885
F1-score:  0.7966101694915254
*** Precision recall i f1 mera nad manjinskoj klasi ***
Precision:  1.0
Recall:  0.8387096774193549
F1-score:  0.9122807017543859



### Linearna regresija 

In [42]:
y_lr_u_pred = linear_regression(X_train_u, y_train_u, X_test_u, y_test_u)

Koeficijenti regresije: [-1.17313839e-03  1.01882316e-03 -7.42588931e-02  7.26641891e-10
 -7.15175311e-03  1.38849777e-03  7.91891369e-04  2.16594105e-01
  2.08003081e-02  3.45558951e-02  1.87052451e-01  4.15509780e-01
  1.95541775e-01 -1.71708019e-02  1.40103761e-01 -7.49400542e-16
 -8.90697389e-02  3.11230498e-01  1.84013735e-02 -2.17592827e-03
  1.62333123e-01 -1.69884452e-01 -3.05311332e-16 -1.15559179e-02
  4.56817519e-01 -2.66884750e-01 -5.02207114e-01  1.11022302e-16
  4.12449293e-01  1.25899333e-01 -4.39746483e-01 -3.02118131e-02
 -1.29702303e-01  5.66889432e-01 -3.46721801e-02  4.65250102e-01
  1.11022302e-16  2.41981745e-01  3.28037331e-01 -5.27321482e-01
 -6.76337090e-02 -2.40574112e-01 -8.22861458e-02  1.97739552e-01
 -2.43593925e-02 -1.60412773e-01  1.62334762e-01 -4.63096564e-01
 -5.97199225e-01 -3.12552936e-01 -2.53409903e-01 -1.15401130e-01
  1.15401130e-01]
Intercept: 0.5499693791609158
Srednja kvadratna greška: 0.17895286652193892
-----
[ 0.75317743  0.48753459  0.483

## Algoritmi nad oversamplovanim podacima

In [43]:
X_oversampling, y_oversampling = oversampling(X, y)
print("---")
X_train_o, y_train_o, X_test_o, y_test_o, X_val_o, y_val_o = dataset_partitioning(X_oversampling, y_oversampling)

Series([], Name: target, dtype: int64)
---
Broj ciljnih vrednosti 'closed' u trening skupu: 18591
Broj ciljnih vrednosti 'closed' u test skupu: 2580
Broj ciljnih vrednosti 'closed' u validacionom skupu: 2066


### Random Forest

In [44]:
y_rnd_forest_o_pred = random_forest(X_train_o, y_train_o, X_test_o, y_test_o, 10)
print("*** Precision recall i f1 mera nad celim skupom ***")
get_precision_recall_f1score(y_test_u, y_rnd_forest_u_pred)
print("*** Precision recall i f1 mera nad manjinskoj klasi ***")
get_precision_recall_f1score_for_minority_class(y_test_u, y_rnd_forest_u_pred)

Train Accuracy: 0.9901030040609956
Test Accuracy: 0.9720876138786586
[0 1 1 ... 1 1 1]
*** Precision recall i f1 mera nad celim skupom ***
Precision:  0.8245614035087719
Recall:  0.7704918032786885
F1-score:  0.7966101694915254
*** Precision recall i f1 mera nad manjinskoj klasi ***
Precision:  1.0
Recall:  0.8387096774193549
F1-score:  0.9122807017543859


### Linearna regresija

In [45]:
y_lr_o_pred = linear_regression(X_train_o, y_train_o, X_test_o, y_test_o)

Koeficijenti regresije: [ 5.57847225e-05 -5.75094778e-04 -1.14142926e-02  1.51955232e-10
  1.09839786e-02  9.58689327e-04  2.76285817e-04  1.37444186e-01
 -2.09388420e-02  7.97522016e-01  8.98107375e-01  9.28167449e-01
  8.89023166e-01  8.55291772e-01  8.89863837e-01  9.45852852e-01
  7.37548364e-01  9.20344150e-01  8.42737356e-01  8.97422131e-01
  8.36630111e-01  5.60453005e-01  9.13908026e-01  7.82166176e-01
  9.43752346e-01  8.88106819e-01  9.18545949e-01  9.35484245e-01
  9.64161018e-01  9.01999871e-01  4.11058092e-01  7.32599695e-01
  8.89011471e-01  9.90475392e-01  8.94440590e-01  9.15507149e-01
  9.16571482e-01  8.77965088e-01  9.13678586e-01  8.37464576e-01
  8.88215269e-01  9.03335993e-01  8.54167332e-01  9.23073916e-01
  8.81199188e-01  7.82823280e-01  8.50912689e-01  8.60738419e-01
  4.91355965e-01  8.21852058e-01  4.41531408e-01 -5.49961648e-02
  5.49961647e-02]
Intercept: 0.022133557010385052
Srednja kvadratna greška: 0.07271737745161716
-----
[0.07497602 0.95203827 0.8849