In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Load the data
data = pd.read_csv('valuable_data_lastone.csv')
cols=['position 1', 'position 2', 'field of studies 1','experince 1', 'experince 2', 'field of studies 2',
       'degree 1', 'degree 2', 'industry', 'skills', 'influencer', 'country']

for col in cols:
  data[col] = data[col].fillna("Unknown")
# Convert non-string values to strings in the feature columns
text_features = data[['position 1', 'position 2', "experince 1","experince 2", 
                      'field of studies 1', 'field of studies 2', 
                      'degree 1', 'degree 2', 'industry', 'skills',
                      'influencer', 'country', 'summary']].copy()

# Handle non-string values in each column
for column in text_features.columns:
    text_features[column] = text_features[column].astype(str)

# Combine all text features into a single string column
text_data = text_features.apply(lambda x: ' '.join(x), axis=1).tolist()
labels = data['characters'].tolist()

# Convert labels to integers
label_mapping = {label: idx for idx, label in enumerate(set(labels))}
y = np.array([label_mapping[label] for label in labels])

# Update label mapping to start from 0
label_mapping = {label: idx for label, idx in label_mapping.items()}
num_classes = len(label_mapping)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
sequences = tokenizer.texts_to_sequences(text_data)

# Pad sequences to have consistent length
max_sequence_length = max(len(seq) for seq in sequences)
print(max_sequence_length)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Convert the data to NumPy arrays
X_text = np.array(padded_sequences)

# Split the data into training and testing sets
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.1, random_state=42)

# Textual input branch
text_input = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(vocab_size, 100, input_length=max_sequence_length)(text_input)
lstm_layer = LSTM(100)(embedding_layer)
output_layer = Dense(num_classes, activation='softmax')(lstm_layer)

# Define the model
model = Model(inputs=text_input, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_text, y_train, validation_data=(X_test_text, y_test),
          epochs=10, batch_size=32)

# Save the model
model.save('model2_with_13000.h5')


557
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50

KeyboardInterrupt: 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [110]:

data = pd.read_csv("./last_60k_with_chars.csv")


In [111]:
data['characters'].value_counts()

characters
Cs (Editor)        5810
C (Analyst)        4490
SC (Stabilizer)    4043
Sc (Planner)       3415
S (Supporter)      3009
Cd (Skeptic)       2885
Si (Counselor)     2883
Is (Encourager)    2858
IS (Harmonizer)    2808
I (Motivator)      2424
Id (Influencer)    2193
CD (Questioner)    1977
DI (Initiator)     1752
Di (Driver)        1749
Dc (Architect)     1689
D (Captain)        1516
Encourager (Is)    1481
Harmonizer (IS)    1467
Editor (Cs)        1383
Counselor (Si)     1331
Analyst (C)        1153
Supporter (S)      1130
Motivator (I)      1126
Planner (Sc)       1120
Stabilizer (SC)    1075
Influencer (Id)     901
Skeptic (Cd)        838
Driver (Di)         779
Initiator (DI)      746
Questioner (CD)     634
Captain (D)         622
Architect (Dc)      610
Driver (Di);          1
Name: count, dtype: int64

In [114]:
data.to_csv("last_60k_with_chars.csv", index=False)

In [112]:
def modify_element(value):
    # Example: Multiply each element by 2
    a=value.replace(" ","").replace(")","").split("(")
    return max(a, key=len)

# Use a for loop to iterate through the column and modify elements
for index, row in data.iterrows():
    data.at[index, 'characters'] = modify_element(row['characters'])

In [115]:
data['characters'].value_counts()

characters
Editor        7193
Analyst       5643
Stabilizer    5118
Planner       4535
Encourager    4339
Harmonizer    4275
Counselor     4214
Supporter     4139
Skeptic       3723
Motivator     3550
Influencer    3094
Questioner    2611
Driver        2529
Initiator     2498
Architect     2299
Captain       2138
Name: count, dtype: int64

In [117]:
import spacy
X = data[['position 1', 'position 2', 'field of studies 1', 'field of studies 2',
          'degree 1', 'degree 2', 'industry', 'skills',  'summary']]
y = data['characters']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
nlp = spacy.load("en_core_web_sm")
# Preprocess text data using spaCy for tokenization:
def tokenize_text(text):
    doc = nlp(text)
    return " ".join([token.text for token in doc])

X_train['tokenized_text'] = X_train.apply(lambda row: tokenize_text(" ".join(row.astype(str))), axis=1)
X_test['tokenized_text'] = X_test.apply(lambda row: tokenize_text(" ".join(row.astype(str))), axis=1)

# TF-IDF Vectorization:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['tokenized_text'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['tokenized_text'])

# Train a classification model (e.g., Multinomial Naive Bayes):
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# Make predictions on the test set:
y_pred = clf.predict(X_test_tfidf)

# Evaluate the model's performance:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

Accuracy: 0.24935379644588046
Classification Report:
               precision    recall  f1-score   support

     Analyst       0.31      0.25      0.28      1142
   Architect       0.19      0.16      0.17       437
     Captain       0.30      0.10      0.15       404
   Counselor       0.21      0.29      0.24       816
      Driver       0.27      0.27      0.27       539
      Editor       0.33      0.54      0.41      1395
  Encourager       0.20      0.33      0.25       862
  Harmonizer       0.24      0.19      0.21       877
  Influencer       0.21      0.22      0.22       636
   Initiator       0.24      0.18      0.21       485
   Motivator       0.13      0.05      0.07       683
     Planner       0.23      0.24      0.23       905
  Questioner       0.21      0.08      0.12       547
     Skeptic       0.22      0.27      0.24       760
  Stabilizer       0.31      0.25      0.27      1048
   Supporter       0.18      0.15      0.16       844

    accuracy              

In [28]:
import pandas as pd
import json
positions0=[]
positions1=[]

skills=[]
field_of_studies0=[]
field_of_studies1=[]
degrees0=[]
degrees1=[]
industries=[]
summaries=[]
# model_crystal\datas\13000_profiles_without_duplicates.csv
with open(f'./datas/json_responses_13000/13075.json', 'r') as json_file:
    res = json.load(json_file)
        
try:
    skills.append(res['skills'])
except:
    skills.append(None)
try:
    position0=res['position_groups'][0]['profile_positions'][0]['title']
    print(position0)
    positions0.append(position0)
except:
    positions0.append(None)

try:
    position1=res['position_groups'][1]['profile_positions'][0]['title']
    positions1.append(position1)
except:
    positions1.append(None)

field_of_study0=""
try:
    field_of_study0=res['education'][0]['field_of_study']
    field_of_studies0.append(field_of_study0)
except:
    field_of_study0=None
    field_of_studies0.append(field_of_study0)

try:
    field_of_study1=res['education'][1]['field_of_study']
    field_of_studies1.append(field_of_study1)
except:
    field_of_study1=None
    field_of_studies1.append(field_of_study1)

try:
    degree_name0=res['education'][0]['degree_name']
    degrees0.append(degree_name0)
except:
    degree_name0=None
    degrees0.append(degree_name0)

try:
    degree_name1=res['education'][1]['degree_name']
    degrees1.append(degree_name1)
except:
    degree_name1=None
    degrees1.append(degree_name1)

try:
    summaries.append(res['summary'])
except:
    summaries.append(None)
try:
    industries.append(res['industry'])
except:
    industries.append(None)

user_input = pd.DataFrame({
    'position 1': positions0[0],
    'position 2': positions1[0],
    'field of studies 1': field_of_studies0[0],

    'field of studies 2': field_of_studies1[0],
    'degree 1':degrees0[0],
    'degree 2':degrees1[0],
    'industry': industries[0],
    'skills': skills[0],
})
user_input['tokenized_text'] = user_input.apply(lambda row: tokenize_text(" ".join(row.astype(str))), axis=1)

# TF-IDF Vectorization for user input
user_input_tfidf = tfidf_vectorizer.transform(user_input['tokenized_text'])

# Predict the character for the user input
predicted_character = clf.predict(user_input_tfidf)

print("Predicted Character:", predicted_character[0])

Digital Marketing Manager
Predicted Character: Cd (Skeptic)


editor-stabilizer
editor-editor
analyst-editor
Encourager-Encourager
Initiator-Skeptic?
Stabilizer-Stabilizer
Driver-Architect
Captain-Architect
Encourager-Encourager
Analyst-Stabilizer
Initiator-Influencer
Encourager-Supporter

In [118]:
import joblib

model_filename = 'machine_learning_1st_attempt_60k.joblib'
joblib.dump(clf, model_filename)

['machine_learning_1st_attempt_60k.joblib']

In [None]:
import joblib

loaded_model = joblib.load('machine_learning_1st_attempt.joblib')

XGBoost

In [7]:
import spacy
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
data=pd.read_csv("./last_60k_with_chars.csv")
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocess text data using spaCy for tokenization
def tokenize_text(text):
    doc = nlp(text)
    return " ".join([token.text for token in doc])
X = data[['position 1', 'position 2', 'field of studies 1', 'field of studies 2',
          'degree 1', 'degree 2', 'industry', 'skills',  'summary']]
y = data['characters']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply label encoding to the target variable for both training and testing sets
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Tokenize and TF-IDF Vectorization
X_train['tokenized_text'] = X_train.apply(lambda row: tokenize_text(" ".join(row.astype(str))), axis=1)
X_test['tokenized_text'] = X_test.apply(lambda row: tokenize_text(" ".join(row.astype(str))), axis=1)

tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['tokenized_text'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['tokenized_text'])

# Create and train an XGBoost classifier
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = xgb_classifier.predict(X_test_tfidf)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)


Accuracy: 0.26809369951534734
Classification Report:
               precision    recall  f1-score   support

           0       0.29      0.29      0.29      1142
           1       0.25      0.21      0.23       437
           2       0.24      0.17      0.20       404
           3       0.23      0.26      0.25       816
           4       0.26      0.23      0.25       539
           5       0.35      0.48      0.40      1395
           6       0.25      0.31      0.28       862
           7       0.24      0.26      0.25       877
           8       0.25      0.25      0.25       636
           9       0.24      0.19      0.22       485
          10       0.21      0.20      0.20       683
          11       0.27      0.26      0.26       905
          12       0.22      0.14      0.17       547
          13       0.25      0.22      0.23       760
          14       0.29      0.27      0.28      1048
          15       0.26      0.21      0.23       844

    accuracy              

In [115]:
data.head()

Unnamed: 0.1,Unnamed: 0,characters,position 1,position 2,field of studies 1,field of studies 2,degree 1,degree 2,industry,skills,summary
0,0,Encourager,Docent Nederlands,Auteur,Opleiding leraar Nederlands in de eerste graad,Nederlandse taal- en letterkunde,Master,Master of Arts - MA,Primary/Secondary Education,[],
1,1,Counselor,HR Manager,bestuurslid (secretaris),Comparative Literature,English Language and Literature/Letters,Research master,,Information Technology & Services,"['Academic Writing', 'English Literature', 'Li...",Bij Infi ben ik verantwoordelijk voor het verz...
2,2,Driver,Verkoopmedewerker,Bemonsteraar,Algemene literatuurwetenschap,Office Management,Bachelor's degree,Associate's degree,Apparel & Fashion,"['Engels', 'Marketing', 'Communicatie', 'Analy...",
3,3,Harmonizer,Directeur,Adviseur,International Relations and Affairs,Literature,Master's degree,,Performing Arts,"['Journalism', 'Editing', 'Politics', 'Copywri...",
4,4,Counselor,Content marketeer,Interim team lead & web content editor,Letterkunde - Literair Bedrijf,Algemene Cultuurwetenschappen,Master's Degree,Bachelor's Degree,Internet,"['Editing', 'Social Media', 'Dutch', 'Journali...",Creatieve contentmarketeer met een passie voor...


In [8]:
import pickle

In [9]:
pickle.dump(tfidf_vectorizer, open("vectorizer.pickle", "wb")) #//Save vectorizer
pickle.load(open("vectorizer.pickle", 'rb'))    #// Load vectorizer

In [129]:
import joblib

model_filename = 'XGBoost_1st_attempt_60k.joblib'
joblib.dump(xgb_classifier, model_filename)

['XGBoost_1st_attempt_60k.joblib']

In [104]:
link="https://www.linkedin.com/in/korine-morgan-866542150/"

In [None]:
https://www.linkedin.com/in/james-efmorfidis-0a4429209/

In [91]:
import requests

In [105]:
id = link.split('/')[4]
url = "https://api.iscraper.io/v2/profile-details"

payload = {
    'profile_id': id,
}
print(id)
headers = {
    'X-API-KEY': 'hVSqiv11cY1W5YUawXUDLBn0jb4G5W44',
}

response = requests.post(url, json=payload, headers=headers)
res=response.json()

korine-morgan-866542150


In [113]:
res['education'][1]['field_of_study']

'Criminologie'

In [145]:
import pandas as pd
positions0=[]
positions1=[]
skills=[]
field_of_studies0=[]
field_of_studies1=[]
degrees0=[]
degrees1=[]
industries=[]
summaries=[]
if res['skills']:
    skills.append(res['skills'])
else:
    skills.append("NaN")
    print("HI")
if res['position_groups'][0]['profile_positions'][0]['title']:
    position0=res['position_groups'][0]['profile_positions'][0]['title']
    positions0.append(position0)
else:
    positions0.append("NaN")
try:
    position1=res['position_groups'][1]['profile_positions'][0]['title']
    positions1.append(position1)
except:
    positions1.append("NaN")
try:
    field_of_study0=res['education'][0]['field_of_study']
    field_of_studies0.append(field_of_study0)
except:
    field_of_studies0.append("NaN")
try:
    field_of_study1=res['education'][1]['field_of_study']
    field_of_studies1.append(field_of_study1)
except:
    field_of_studies1.append("NaN")
try:
    degree_name0=res['education'][0]['degree_name']
    degrees0.append(degree_name0)
except:
    degrees0.append("NaN")
try:
    degree_name1=res['education'][1]['degree_name']
    degrees1.append(degree_name1)
except:
    degrees1.append("NaN")
try:
    summaries.append(res['summary'])
except:
    summaries.append("NaN")
try:
    industries.append(res['industry'])
except:
    industries.append("NaN")

user_input = pd.DataFrame({
    'position 1': positions0[0],
    'position 2': positions1[0],
    'field of studies 1': field_of_studies0[0],
    'field of studies 2': field_of_studies1[0],
    'degree 1':degrees0[0],
    'degree 2':degrees1[0],
    'industry': industries[0],
    'skills': skills[0],
}, index=[0])
user_input['tokenized_text'] = user_input.apply(lambda row: tokenize_text(" ".join(row.astype(str))), axis=1)

# TF-IDF Vectorization for user input
user_input_tfidf = tfidf_vectorizer.transform(user_input['tokenized_text'])

# Predict the character for the user input
predicted_character = xgb_classifier.predict(user_input_tfidf)

# print("Predicted Character:", label_encoder.inverse_transform(predicted_character)[0])
# dict_[i]=predicted_character
# dict_list.append(dict_)

HI


In [147]:
user_input

Unnamed: 0,position 1,position 2,field of studies 1,field of studies 2,degree 1,degree 2,industry,skills,tokenized_text
0,Beleidsmedewerker,Credible Messenger,Sociale wetenschappen,Criminologie,,,Research,,Beleidsmedewerker Credible Messenger Sociale w...


In [119]:
res['position_groups'][0]['profile_positions'][0]['title']

'Beleidsmedewerker'

In [122]:
user_input

Unnamed: 0,position 1,position 2,field of studies 1,field of studies 2,degree 1,degree 2,industry,skills,tokenized_text


In [96]:
res

{'profile_id': 'james-efmorfidis-0a4429209',
 'first_name': 'James',
 'last_name': 'Efmorfidis',
 'sub_title': 'Soccer Player at RKC Waalwijk',
 'profile_picture': 'https://media.licdn.com/dms/image/C4E03AQEfuZCuGu5HiA/profile-displayphoto-shrink_800_800/0/1619700178644?e=1701907200&v=beta&t=p-51YNTEmeIqkQcCLwtww-fcHQFGTfJuVMB7-4Vp3JE',
 'background_image': 'https://media.licdn.com/dms/image/C4E16AQGMQ5VXQDbprA/profile-displaybackgroundimage-shrink_350_1400/0/1619700248345?e=1701907200&v=beta&t=KPDBl1aJXqRpgaCyxXXnhh30QdkifblmkXXp5ZXSf6o',
 'profile_type': 'personal',
 'entity_urn': 'ACoAADTsHzwBJq0JGszXMZKLseDfSZlmtaE-GUM',
 'object_urn': 887889724,
 'birth_date': None,
 'summary': None,
 'location': {'country': 'Netherlands',
  'short': 'Amsterdam, North Holland',
  'city': 'Amsterdam',
  'state': 'North Holland',
  'default': 'Amsterdam, North Holland, Netherlands'},
 'premium': False,
 'influencer': False,
 'treasury_media': [],
 'languages': {'primary_locale': {'country': 'US', 'l

In [14]:
label_encoder.inverse_transform(predicted_character)

array(['Motivator', 'Harmonizer', 'Harmonizer', 'Initiator', 'Harmonizer',
       'Counselor', 'Harmonizer', 'Harmonizer', 'Motivator', 'Influencer',
       'Harmonizer', 'Encourager', 'Motivator', 'Motivator', 'Harmonizer',
       'Initiator', 'Motivator', 'Harmonizer', 'Harmonizer', 'Motivator'],
      dtype=object)