In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pickle


### Load and preprocess data 

In [3]:
def load_and_preprocess_data(csv_file, pickle_file):
  """
  Loads a CSV file, cleans and preprocesses the data, and stores it in a pickle file.
  Args:
      csv_file (str): Path to the CSV file containing job data.
      pickle_file (str): Path to the pickle file where preprocessed data will be stored.
  Returns:
      pandas.DataFrame: The preprocessed DataFrame containing job data.
  """

  df = pd.read_csv(csv_file)
  df = df.dropna()  # Remove rows with missing values (optional)
  tdif = TfidfVectorizer(stop_words='english')
  df['jobdescription'] = df['jobdescription'].fillna('')
  tdif_matrix = tdif.fit_transform(df['jobdescription'])

  with open(pickle_file, 'wb') as f:
      data = {'df': df, 'tdif': tdif, 'tdif_matrix': tdif_matrix}
      pickle.dump(data, f)

  return df, tdif, tdif_matrix

In [11]:
csv_file = './dataStore.csv'
pickle_file = 'preprocessed_dataStore.pkl'

try:
  with open(pickle_file, 'rb') as f:
    data = pickle.load(f)
    df = data['df']
    tdif = data['tdif']  # Load TF-IDF vectorizer if needed
    tdif_matrix = data['tdif_matrix']  # Load TF-IDF matrix if needed
    print("Loaded data from pickle file")
except FileNotFoundError:
  print("Pickle file not found, loading and preprocessing data from CSV...")
  df, tdif, tdif_matrix = load_and_preprocess_data(csv_file, pickle_file)

Pickle file not found, loading and preprocessing data from CSV...


### Insert to database . job table

In [None]:
table_columns = ", ".join(df.columns.tolist())
table_columns   

'jobtitle, company, jobdescription, joblocation_address'

In [15]:
sql_insert = f"""INSERT INTO job ({table_columns}) VALUES (%s, %s, %s, ..., %s)"""
sql_insert

'INSERT INTO job (jobtitle, company, jobdescription, joblocation_address) VALUES (%s, %s, %s, ..., %s)'

In [20]:
# Accessing data using label (assuming "company" is the column name)
# company_name = df.loc[0, "company"]

# Accessing data using integer position (assuming "company" is at index 1)
company_name = df.iloc[0, 1]  # This would work if "company" is the second column

print(f"Company name at row 1: {company_name}")

Company name at row 1: Digital Intelligence Systems, LLC


### Check recommend

Dung tdif matrix và linear kernel để tính ma trận độ tương đồng cosine

In [12]:
tdif_matrix.shape

(127, 1149)

In [13]:
# Tính toán ma trận độ tương đồng cosine giữa tất cả các mô tả công việc. Độ tương đồng cosine đo lường mức độ giống nhau của hai tài liệu dựa trên vectơ TF-IDF của chúng.
cosine_sim=linear_kernel(tdif_matrix,tdif_matrix)
type(cosine_sim)

numpy.ndarray

In [15]:
# Tạo một Series ánh xạ các tiêu đề công việc duy nhất với các chỉ mục tương ứng của chúng trong DataFrame (để truy xuất hiệu quả).
indices=pd.Series(df.index, index=df['jobTitle']).drop_duplicates()
type(indices)

pandas.core.series.Series

In [16]:
def get_recommendation(title, cosine_sim=cosine_sim):
    try:
        # Get the index of the input title
        idx = indices[title]

        # Calculate cosine similarity scores for all jobs
        sim_scores = list(enumerate(cosine_sim[idx]))  # Enumerate indices and similarity scores

        # Handle potential empty similarity scores (e.g., no related jobs)
        if not sim_scores:
            print(f"No similar jobs found for '{title}'.")
            return []

        # Sort jobs by similarity score in descending order (most similar first)
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Select the top 15 jobs (excluding the input job at index 0)
        top_15_indices = [i for i, _ in sim_scores[:16]]  # Get top 15 indices (16 to exclude the input)

        # Extract job titles from the DataFrame using the selected indices
        recommendations = df['jobtitle'].iloc[top_15_indices[1:]]  # Exclude input job at index 0

        return recommendations

    except KeyError:
        print(f"Job title '{title}' not found in the data.")
        return []

In [17]:
get_recommendation('Lead DevOps Engineer')


Job title 'Lead DevOps Engineer' not found in the data.


[]

### function insert to database

In [4]:
%pip install mysql-connector-python

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [7]:
import pandas as pd
# Get the table columns (assuming you know the column names)
table_columns = ", ".join(df.columns.tolist())  # Convert list to comma-separated string
table_columns

'jobtitle, company, jobdescription, joblocation_address'

In [8]:
df1 = df[['jobtitle', 'jobdescription', 'joblocation_address']]
df1

Unnamed: 0,jobtitle,jobdescription,joblocation_address
0,(US)-Program Manager Senior,Responsible for managing one or more highly co...,"Woodland Hills, CA"
1,"Business Analyst - Mortgage/Equiting Lending, ...",Job Description: Seeking a Business Process An...,"Los Angeles, CA"
2,DHMSM Operational Medicine Interface Developer,"TAD PGS, INC. is currently seeking a DHMSM Ope...","Vienna, VA"
3,Information Technology Architect,"Our client, one of the world's leading profess...","Alpharetta, GA"
4,SWIFT Messaging Specialist,"Our client, one of the largest banking and fin...","Ny, NY"
...,...,...,...
3412,Senior C++ Developer,Excellent opportunity for a highly skilled C++...,"Greenwich, CT"
3413,Software Developer,Title: Software DeveloperMandatory skills: OOP...,"Princeton, NJ"
3414,Storage Admin - Hitachi Data Systems,Required:Ability to perform scripting using PE...,"Jersey City, NJ"
3415,Lead Data Integration Engineer,Our client is currently seeking a Lead Data In...,"Blue Bell, PA"


In [9]:
# Add new columns with default values
df1['job_field_code'] = 'IT'
df1['salary_code'] = '0'
df1['job_type_code'] = 'FT'
df1['yeu_cau_cong_viec'] = """- Trung thực, tư duy tốt, có khả năng học hỏi công nghệ mới nhanh
- Tiếng anh giao tiếp tốt"""
df1['deadline'] = '2024-06-30 00:00:00'
df1['degree_code'] = 'CN'
df1['id_employer'] = '4'
df1['quyen_loi'] = """- Thu nhập: Thương lượng (từ 7-15 triệu) tuỳ theo năng lực và kinh nghiệm.
- Cơ hội thăng tiến tốt
- Tham gia BHXH, BHTN, BHYT theo đúng quy định nhà nước
- Được thưởng tháng lương thứ 13....
- Ngoài ra còn rất nhiều quyền lợi khác."""
df1['province_cong_viec'] = 'DN'
df1.head()

Unnamed: 0,jobtitle,jobdescription,joblocation_address,job_field_code,salary_code,job_type_code,yeu_cau_cong_viec,deadline,degree_code,id_employer,quyen_loi,province_cong_viec
0,(US)-Program Manager Senior,Responsible for managing one or more highly co...,"Woodland Hills, CA",IT,0,FT,"- Trung thực, tư duy tốt, có khả năng học hỏi ...",2024-06-30 00:00:00,CN,4,- Thu nhập: Thương lượng (từ 7-15 triệu) tuỳ t...,DN
1,"Business Analyst - Mortgage/Equiting Lending, ...",Job Description: Seeking a Business Process An...,"Los Angeles, CA",IT,0,FT,"- Trung thực, tư duy tốt, có khả năng học hỏi ...",2024-06-30 00:00:00,CN,4,- Thu nhập: Thương lượng (từ 7-15 triệu) tuỳ t...,DN
2,DHMSM Operational Medicine Interface Developer,"TAD PGS, INC. is currently seeking a DHMSM Ope...","Vienna, VA",IT,0,FT,"- Trung thực, tư duy tốt, có khả năng học hỏi ...",2024-06-30 00:00:00,CN,4,- Thu nhập: Thương lượng (từ 7-15 triệu) tuỳ t...,DN
3,Information Technology Architect,"Our client, one of the world's leading profess...","Alpharetta, GA",IT,0,FT,"- Trung thực, tư duy tốt, có khả năng học hỏi ...",2024-06-30 00:00:00,CN,4,- Thu nhập: Thương lượng (từ 7-15 triệu) tuỳ t...,DN
4,SWIFT Messaging Specialist,"Our client, one of the largest banking and fin...","Ny, NY",IT,0,FT,"- Trung thực, tư duy tốt, có khả năng học hỏi ...",2024-06-30 00:00:00,CN,4,- Thu nhập: Thương lượng (từ 7-15 triệu) tuỳ t...,DN


In [None]:
# df1.drop('quyen_loi', axis=1, inplace=True)  # Drop the 'quyen_loi' column

In [None]:
employer = df[['company']]
employer

In [10]:
import mysql.connector
try:
    mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="",
    database="tuyendung",
  )
    cursor = mydb.cursor()
    
    sql_insert = f"""INSERT INTO `jobs` (`vi_tri`, `mo_ta`, `address_cong_viec`, `job_field_code`, `salary_code`, `job_type_code`,
    `yeu_cau_cong_viec`, `deadline`,  `degree_code`, `id_employer`, `quyen_loi`, `province_cong_viec`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
    # values = df1.head(10).to_records(index=False).tolist()
    values = df1.iloc[100:200].to_records(index=False).tolist()

    cursor.executemany(sql_insert, values)
    mydb.commit()

    print(f"Successfully inserted {cursor.rowcount} rows to the database.")

except mysql.connector.Error as err:
    print("Error connecting to database:", err)
finally:
    if mydb:
        mydb.close()
        cursor.close()

Successfully inserted 100 rows to the database.


In [108]:
values = df1.iloc[10:21].to_records(index=False)  
values

rec.array([('Mobile Device QA Tester II', "Denali Advanced Integration is one of the nation’s leading technology integrators by volume and capacity with experience in Data Center, Unified Communication’s, Mobility and Virtualization and Testing. Denali has more than 400 dedicated employees focused on design, architecture, implementation, operations and Mobile Device Testing. From Client End Devices to the Cloud, Denali provides service to retail, healthcare, industrial, government and telecom environments both domestically and abroad. We are seeking a results driven, Mobile Device Tester with a background in call-center/customer support/technical support with post product (website/mobile app and content) launch support to join our fast-growing Test Team in SE Bellevue, WA. This role reports to the Manager of Product Realization Post-Launch Support. JOB DESCRIPTION:Your first role is in our client's Product Realization Post-Launch Support group, and is responsible for the post-launch su

#### Get data from mysql table and store to dataframe

In [11]:
try:
  # Connect to the MySQL database
  connection = mysql.connector.connect(
    host="localhost",
    user="root",
    password="",
    database="tuyendung",
  )

  # Create a cursor object
  cursor = connection.cursor()

  # Define the SQL query to select data from your table
  sql_query = "SELECT `vi_tri`, `mo_ta`, `address_cong_viec` FROM `jobs`" 

  # Execute the query
  cursor.execute(sql_query)

  # Fetch all results as a list of tuples
  data = cursor.fetchall()

  # # Create a DataFrame from the fetched data
  dfDB = pd.DataFrame(data, columns=[col[0] for col in cursor.description], index=None)  # Extract column names
except mysql.connector.Error as err:
  print(f"Error connecting to database: {err}")
finally:
  if connection:
    connection.close()
    cursor.close()

# Now you can work with the DataFrame 'dfDB'
print(dfDB.head(2))  # Print the first few rows of the DataFrame


                  vi_tri                                              mo_ta  \
0  Chuyên viên marketing  - Vận hành máy in 5 màu\n- Thời gian làm việc:...   
1     Nhân viên bán hàng                                             mota 1   

  address_cong_viec  
0            432 ds  
1              None  


In [13]:
dfDB.rename(columns={'vi_tri': 'jobTitle', 'mo_ta': 'jobdescription', 'address_cong_viec': 'joblocation_address'}, inplace=True)
dfDB


Unnamed: 0,jobTitle,jobdescription,joblocation_address
0,Chuyên viên marketing,- Vận hành máy in 5 màu\n- Thời gian làm việc:...,432 ds
1,Nhân viên bán hàng,mota 1,
2,Giám đốc marketing,mota 1,
3,Test deadline,mota 1,123 street da nang
4,nhan vien kinh doanh update,mota 1,432 quoc lo 9
...,...,...,...
124,Sr Web Developer - Test and Measurement,JOIN US AS A SR. WEB ANALYTICS DEVELOPER Simil...,"Minneapolis, MN"
125,MQ Middleware Support,"Software Guidance & Assistance, Inc., (SGA), i...","Research Triangle Park, NC"
126,ENTERPRISE SYSTEMS ENGINEER,ENTERPRISE SYSTEMS ENGINEEROur client is seeki...,"Seattle, WA"
127,QA Automation Engineer,"QA Automation Engineer Glendale, CA Contract O...","Glendale, CA"


In [14]:
dfDB.to_csv('dataStore.csv', index=True, header=True)

In [15]:
print(dfDB.head(2))


                jobTitle                                     jobdescription  \
0  Chuyên viên marketing  - Vận hành máy in 5 màu\n- Thời gian làm việc:...   
1     Nhân viên bán hàng                                             mota 1   

  joblocation_address  
0              432 ds  
1                None  


#### Dùng dữ liệu từ table để train ai