In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Load the profile CSV straight from GitHub.
data = pd.read_csv('https://raw.githubusercontent.com/fayyoz24/Bots_using_selenium/main/model_crystal/without%20certifications%20linkedin%20profiles.csv')
cols = ['position 1', 'position 2', 'field of studies 1', 'experince 1', 'experince 2', 'field of studies 2',
        'degree 1', 'degree 2', 'industry', 'skills', 'influencer', 'country']

# Replace missing values in the source frame with a placeholder token.
for col in cols:
    data[col] = data[col].fillna("Unknown")

# Columns that are concatenated into the model's text input. 'summary' is
# part of the input but is not listed in `cols`, so it is filled here on the
# copy; otherwise its missing values would be stringified to the literal
# token "nan" instead of the "Unknown" placeholder used everywhere else.
text_columns = ['position 1', 'position 2', 'experince 1', 'experince 2',
                'field of studies 1', 'field of studies 2',
                'degree 1', 'degree 2', 'industry', 'skills',
                'influencer', 'country', 'summary']

# Work on an independent copy: fill any remaining gaps, then cast every
# value to str so that non-string cells can be joined below.
text_features = data[text_columns].fillna("Unknown").astype(str)

# Combine all text features of a row into a single space-separated string.
text_data = text_features.apply(' '.join, axis=1).tolist()

# The 'characters' column holds the target class of each row.
labels = data['characters'].tolist()

# Encode each distinct label as an integer class id in [0, num_classes).
# The unique labels are sorted before numbering: iterating a bare set of
# strings follows hash order, which differs between interpreter runs unless
# PYTHONHASHSEED is pinned, so an unsorted mapping would not be reproducible
# and a saved model's output indices could not be decoded later.
# key=str keeps the sort well-defined even if the column mixes value types.
label_mapping = {
    label: idx for idx, label in enumerate(sorted(set(labels), key=str))
}
y = np.array([label_mapping[label] for label in labels])

# Number of output units the classifier needs.
num_classes = len(label_mapping)

# Learn the vocabulary from the full corpus and turn every document into a
# list of integer token ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
sequences = tokenizer.texts_to_sequences(text_data)
word_index = tokenizer.word_index
# One extra slot: Keras Tokenizer never assigns id 0 to a word, and 0 is
# the value pad_sequences fills with by default.
vocab_size = 1 + len(word_index)

# Pad every sequence up to the length of the longest document so the
# model sees a fixed-width input.
max_sequence_length = max(map(len, sequences))
print(max_sequence_length)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Feature matrix as a NumPy array: one row per document.
X_text = np.array(padded_sequences)

# Hold out 10% of the rows; the fixed seed keeps the split repeatable.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text,
    y,
    test_size=0.1,
    random_state=42,
)

# Text branch: token ids -> 100-d embeddings -> LSTM -> softmax over classes.
text_input = Input(shape=(max_sequence_length,))
# `input_length` is intentionally not passed: the Input layer above already
# fixes the sequence length, and the argument is deprecated in Keras 3.
embedding_layer = Embedding(vocab_size, 100)(text_input)
lstm_layer = LSTM(100)(embedding_layer)
output_layer = Dense(num_classes, activation='softmax')(lstm_layer)

# Assemble the functional model from its input and output tensors.
model = Model(inputs=text_input, outputs=output_layer)

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model.
# NOTE(review): the held-out test split doubles as validation data here, so
# the reported val_* metrics are not independent of any later evaluation on
# that same split.
model.fit(
    X_train_text,
    y_train,
    validation_data=(X_test_text, y_test),
    epochs=50,
    batch_size=32,
)

# Persist the trained model. The '.h5' suffix selects the legacy HDF5
# format; the file name is kept as-is in case other code loads it by path.
model.save('model2_test_size=0.1.h5')
