#Text Classification

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [35]:
df_resume = pd.read_csv('resumes_train.csv')
df_resume.head()

Unnamed: 0,resume,role
0,### John Doe\n\n---\n\n#### Summary\nDetail-or...,Data Scientist
1,Creating a resume for a Data Scientist based o...,Data Scientist
2,Creating a resume for a Data Scientist...\n\n-...,Data Scientist
3,Creating a resume for a Data Scientist based o...,Data Scientist
4,## John Doe\n\n---\n\n### Summary:\nData Scien...,Data Scientist


In [36]:
from sentence_transformers import SentenceTransformer

def generate_embeddings(text, model_name="all-MiniLM-L6-v2"):
    # Load the pre-trained model
    model = SentenceTransformer(model_name)

    # Generate embeddings
    embeddings = model.encode(text)

    return embeddings

In [37]:
text_embeddings = generate_embeddings(df_resume['resume'])

text_embeddings_list = text_embeddings.tolist()

In [38]:
column_names = ["embedding_" + str(i) for i in range(len(text_embeddings_list[0]))]

df_train = pd.DataFrame(text_embeddings_list, columns=column_names)

df_train['is_data_scientist'] = df_resume['role']=="Data Scientist"

In [47]:
X = df_train.iloc[:,:-1]
y = df_train.iloc[:,-1]

clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X,y)

In [48]:
print(clf.score(X,y))

print(roc_auc_score(y, clf.predict_proba(X)[:,1]))

1.0
1.0


###Test

In [49]:
df_resume_test = pd.read_csv('resumes_test.csv')

text_embeddings_test = generate_embeddings(df_resume_test['resume'])

text_embeddings_test_list = text_embeddings_test.tolist()

In [50]:
df_test = pd.DataFrame(text_embeddings_test_list, columns=column_names)
df_test['is_data_scientist'] = df_resume['role']=="Data Scientist"

In [51]:
X_test = df_test.iloc[:,:-1]
y_test = df_test.iloc[:,-1]

In [52]:
print(clf.score(X_test, y_test))

print(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))

0.54
0.585


#Semantic Search

In [53]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer

from sklearn.decomposition import PCA
from sklearn.metrics import DistanceMetric

import matplotlib.pyplot as plt
import matplotlib as mpl

In [54]:
df_resume = pd.read_csv('resumes_train.csv')

df_resume['role'][df_resume['role'].iloc[-1] == df_resume['role']] = "Other"

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_resume['role'][df_resume['role'].iloc[-1] == df_resume['role']] = "Other"


In [56]:
model = SentenceTransformer("all-MiniLM-L6-v2")

embedding_arr = model.encode(df_resume['resume'])

In [58]:
query = "I need someone to build out my data infrastructure"

query_embedding = model.encode(query)

In [61]:
dist = DistanceMetric.get_metric('euclidean')

dist_arr = dist.pairwise(embedding_arr, query_embedding.reshape(1, -1)).flatten()

idist_arr_sorted = np.argsort(dist_arr)

In [63]:
print(df_resume['role'].iloc[idist_arr_sorted[:10]])

48        Data Engineer
58        Data Engineer
43        Data Engineer
93    Data Entrepreneur
41        Data Engineer
55        Data Engineer
40        Data Engineer
56        Data Engineer
46        Data Engineer
47        Data Engineer
Name: role, dtype: object


In [64]:
print(df_resume['resume'].iloc[idist_arr_sorted[0]])

**John Doe**

---

**Summary:**
Highly skilled and experienced Data Engineer with a strong background in designing, implementing, and maintaining data pipelines. Proficient in data modeling, ETL processes, and data warehousing. Adept at working with large datasets and optimizing data workflows to improve efficiency.

---

**Professional Experience:**
- **Senior Data Engineer**  
  XYZ Tech, Anytown, USA  
  June 2018 - Present  
  - Designed and developed scalable data pipelines to handle terabytes of data daily.
  - Optimized ETL processes to improve data quality and processing time by 30%.
  - Collaborated with cross-functional teams to implement data architecture best practices.

- **Data Engineer**  
  ABC Solutions, Sometown, USA  
  January 2015 - May 2018  
  - Built and maintained data pipelines for real-time data processing.
  - Developed data models and implemented data governance policies.
  - Worked on data integration projects to streamline data access for business users.


In [65]:
query = "Data Engineer with Apache Ariflow experience"

query_embedding = model.encode(query)

dist = DistanceMetric.get_metric('euclidean')

dist_arr = dist.pairwise(embedding_arr, query_embedding.reshape(1, -1)).flatten()

idist_arr_sorted = np.argsort(dist_arr)

In [66]:
print(df_resume['role'].iloc[idist_arr_sorted[:10]])

48    Data Engineer
51    Data Engineer
47    Data Engineer
46    Data Engineer
55    Data Engineer
43    Data Engineer
57    Data Engineer
53    Data Engineer
41    Data Engineer
59    Data Engineer
Name: role, dtype: object


In [67]:
print(df_resume['resume'].iloc[idist_arr_sorted[0]])

**John Doe**

---

**Summary:**
Highly skilled and experienced Data Engineer with a strong background in designing, implementing, and maintaining data pipelines. Proficient in data modeling, ETL processes, and data warehousing. Adept at working with large datasets and optimizing data workflows to improve efficiency.

---

**Professional Experience:**
- **Senior Data Engineer**  
  XYZ Tech, Anytown, USA  
  June 2018 - Present  
  - Designed and developed scalable data pipelines to handle terabytes of data daily.
  - Optimized ETL processes to improve data quality and processing time by 30%.
  - Collaborated with cross-functional teams to implement data architecture best practices.

- **Data Engineer**  
  ABC Solutions, Sometown, USA  
  January 2015 - May 2018  
  - Built and maintained data pipelines for real-time data processing.
  - Developed data models and implemented data governance policies.
  - Worked on data integration projects to streamline data access for business users.
