In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('all_tickets_processed_improved_v3.csv')
df.head

<bound method NDFrame.head of                                                 Document    Topic_group
0      connection with icon icon dear please setup ic...       Hardware
1      work experience user work experience user hi w...         Access
2      requesting for meeting requesting meeting hi p...       Hardware
3      reset passwords for external accounts re expir...         Access
...                                                  ...            ...
47832  git space for a project issues with adding use...         Access
47833  error sent july error hi guys can you help out...  Miscellaneous
47834  connection issues sent tuesday july connection...       Hardware
47835  error cube reports sent tuesday july error hel...     HR Support
47836  running out on extensions hello please be advi...       Hardware

[47837 rows x 2 columns]>

In [4]:
df.columns

Index(['Document', 'Topic_group'], dtype='object')

In [5]:
df.Topic_group.value_counts()

Hardware                 13617
HR Support               10915
Access                    7125
Miscellaneous             7060
Storage                   2777
Purchase                  2464
Internal Project          2119
Administrative rights     1760
Name: Topic_group, dtype: int64

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Document'])

In [7]:
X

<47837x12325 sparse matrix of type '<class 'numpy.float64'>'
	with 1429908 stored elements in Compressed Sparse Row format>

In [8]:
vectorizer.get_feature_names_out()

array(['ab', 'abandon', 'abandoned', ..., 'zoom', 'zooming', 'zori'],
      dtype=object)

In [9]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
reduced_data = pca.fit_transform(X.toarray())

In [10]:
reduced_data

array([[-0.00916055,  0.00106133, -0.01364971, ...,  0.00997621,
         0.04553043,  0.01843808],
       [-0.01848961,  0.02134666, -0.05115649, ..., -0.01731179,
        -0.02316814,  0.00621243],
       [-0.02452673, -0.01734513, -0.02659567, ...,  0.06220629,
        -0.0407992 , -0.03283246],
       ...,
       [-0.07850254, -0.14793352,  0.04664878, ..., -0.01558032,
         0.03511608, -0.03752915],
       [-0.0903372 , -0.13713319,  0.0645576 , ..., -0.00319759,
         0.03229715,  0.00725177],
       [-0.07020202, -0.1068564 ,  0.05326516, ..., -0.00681055,
        -0.01954372,  0.02036706]])

In [11]:
reduced_data.shape

(47837, 50)

In [12]:
df_reduced = pd.DataFrame(reduced_data, columns=[f'PC{i+1}' for i in range(reduced_data.shape[1])])

In [13]:
df_reduced

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC41,PC42,PC43,PC44,PC45,PC46,PC47,PC48,PC49,PC50
0,-0.009161,0.001061,-0.013650,-0.039157,-0.027463,0.004440,0.004752,0.035211,-0.015146,-0.050542,...,0.005528,-0.041354,0.021918,0.027025,-0.049532,0.023538,0.005641,0.009976,0.045530,0.018438
1,-0.018490,0.021347,-0.051156,-0.050898,-0.035130,0.021148,-0.002179,0.040324,0.014768,-0.014623,...,-0.009191,-0.026499,-0.076482,0.031171,-0.026007,0.018319,-0.004992,-0.017312,-0.023168,0.006212
2,-0.024527,-0.017345,-0.026596,-0.055548,-0.015891,0.022059,-0.028053,0.080322,-0.011648,-0.026246,...,-0.014666,0.093363,0.062659,0.022456,-0.002301,0.053131,0.047940,0.062206,-0.040799,-0.032832
3,-0.010441,-0.014520,-0.002184,-0.057624,-0.074366,-0.012638,-0.016751,-0.015058,-0.107458,0.120309,...,0.016022,-0.015356,-0.017364,0.005492,0.058843,-0.083218,0.018930,0.062140,0.037762,0.033496
4,-0.007400,0.023046,-0.013254,-0.047914,-0.030998,0.001873,-0.004793,0.021477,0.001913,-0.032541,...,0.002751,0.000774,-0.018658,-0.010231,-0.019157,0.008262,-0.019203,0.005562,-0.014634,0.008983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47832,-0.086652,-0.086229,0.095960,0.097027,-0.034675,-0.052026,0.062908,0.011197,0.068052,0.004082,...,-0.030286,0.043193,0.160580,-0.026291,-0.024572,-0.053996,0.023311,-0.023581,0.000921,-0.005350
47833,-0.151780,-0.241421,0.142529,0.118897,-0.015284,-0.015317,-0.030586,-0.104544,-0.059438,-0.113814,...,-0.002988,0.004483,-0.059684,-0.012487,-0.011955,-0.031707,0.009093,-0.006207,-0.030114,0.009832
47834,-0.078503,-0.147934,0.046649,0.036664,0.000719,-0.003878,0.002281,0.106259,0.044839,0.013965,...,0.022479,-0.142581,-0.009964,0.030096,0.057992,-0.022950,0.032245,-0.015580,0.035116,-0.037529
47835,-0.090337,-0.137133,0.064558,0.032160,-0.022864,-0.006357,-0.008416,-0.045759,-0.044744,-0.066828,...,-0.051840,-0.050279,0.049761,0.034219,0.051458,-0.053066,-0.028660,-0.003198,0.032297,0.007252


In [14]:
df_reduced['Topic_Group'] = df['Topic_group']
final_df = df_reduced
final_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC42,PC43,PC44,PC45,PC46,PC47,PC48,PC49,PC50,Topic_Group
0,-0.009161,0.001061,-0.01365,-0.039157,-0.027463,0.00444,0.004752,0.035211,-0.015146,-0.050542,...,-0.041354,0.021918,0.027025,-0.049532,0.023538,0.005641,0.009976,0.04553,0.018438,Hardware
1,-0.01849,0.021347,-0.051156,-0.050898,-0.03513,0.021148,-0.002179,0.040324,0.014768,-0.014623,...,-0.026499,-0.076482,0.031171,-0.026007,0.018319,-0.004992,-0.017312,-0.023168,0.006212,Access
2,-0.024527,-0.017345,-0.026596,-0.055548,-0.015891,0.022059,-0.028053,0.080322,-0.011648,-0.026246,...,0.093363,0.062659,0.022456,-0.002301,0.053131,0.04794,0.062206,-0.040799,-0.032832,Hardware
3,-0.010441,-0.01452,-0.002184,-0.057624,-0.074366,-0.012638,-0.016751,-0.015058,-0.107458,0.120309,...,-0.015356,-0.017364,0.005492,0.058843,-0.083218,0.01893,0.06214,0.037762,0.033496,Access
4,-0.0074,0.023046,-0.013254,-0.047914,-0.030998,0.001873,-0.004793,0.021477,0.001913,-0.032541,...,0.000774,-0.018658,-0.010231,-0.019157,0.008262,-0.019203,0.005562,-0.014634,0.008983,Miscellaneous


In [15]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47837 entries, 0 to 47836
Data columns (total 51 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PC1          47837 non-null  float64
 1   PC2          47837 non-null  float64
 2   PC3          47837 non-null  float64
 3   PC4          47837 non-null  float64
 4   PC5          47837 non-null  float64
 5   PC6          47837 non-null  float64
 6   PC7          47837 non-null  float64
 7   PC8          47837 non-null  float64
 8   PC9          47837 non-null  float64
 9   PC10         47837 non-null  float64
 10  PC11         47837 non-null  float64
 11  PC12         47837 non-null  float64
 12  PC13         47837 non-null  float64
 13  PC14         47837 non-null  float64
 14  PC15         47837 non-null  float64
 15  PC16         47837 non-null  float64
 16  PC17         47837 non-null  float64
 17  PC18         47837 non-null  float64
 18  PC19         47837 non-null  float64
 19  PC20

In [17]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(final_df['Topic_Group'])
encoded_labels = encoder.transform(final_df['Topic_Group'])

In [18]:
final_df['Encoded_Topic_Groups'] = pd.Series(encoded_labels)

In [19]:
final_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC43,PC44,PC45,PC46,PC47,PC48,PC49,PC50,Topic_Group,Encoded_Topic_Groups
0,-0.009161,0.001061,-0.013650,-0.039157,-0.027463,0.004440,0.004752,0.035211,-0.015146,-0.050542,...,0.021918,0.027025,-0.049532,0.023538,0.005641,0.009976,0.045530,0.018438,Hardware,3
1,-0.018490,0.021347,-0.051156,-0.050898,-0.035130,0.021148,-0.002179,0.040324,0.014768,-0.014623,...,-0.076482,0.031171,-0.026007,0.018319,-0.004992,-0.017312,-0.023168,0.006212,Access,0
2,-0.024527,-0.017345,-0.026596,-0.055548,-0.015891,0.022059,-0.028053,0.080322,-0.011648,-0.026246,...,0.062659,0.022456,-0.002301,0.053131,0.047940,0.062206,-0.040799,-0.032832,Hardware,3
3,-0.010441,-0.014520,-0.002184,-0.057624,-0.074366,-0.012638,-0.016751,-0.015058,-0.107458,0.120309,...,-0.017364,0.005492,0.058843,-0.083218,0.018930,0.062140,0.037762,0.033496,Access,0
4,-0.007400,0.023046,-0.013254,-0.047914,-0.030998,0.001873,-0.004793,0.021477,0.001913,-0.032541,...,-0.018658,-0.010231,-0.019157,0.008262,-0.019203,0.005562,-0.014634,0.008983,Miscellaneous,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47832,-0.086652,-0.086229,0.095960,0.097027,-0.034675,-0.052026,0.062908,0.011197,0.068052,0.004082,...,0.160580,-0.026291,-0.024572,-0.053996,0.023311,-0.023581,0.000921,-0.005350,Access,0
47833,-0.151780,-0.241421,0.142529,0.118897,-0.015284,-0.015317,-0.030586,-0.104544,-0.059438,-0.113814,...,-0.059684,-0.012487,-0.011955,-0.031707,0.009093,-0.006207,-0.030114,0.009832,Miscellaneous,5
47834,-0.078503,-0.147934,0.046649,0.036664,0.000719,-0.003878,0.002281,0.106259,0.044839,0.013965,...,-0.009964,0.030096,0.057992,-0.022950,0.032245,-0.015580,0.035116,-0.037529,Hardware,3
47835,-0.090337,-0.137133,0.064558,0.032160,-0.022864,-0.006357,-0.008416,-0.045759,-0.044744,-0.066828,...,0.049761,0.034219,0.051458,-0.053066,-0.028660,-0.003198,0.032297,0.007252,HR Support,2


In [20]:
final_df.to_csv('pre_processed_it_ticket_classification_df.csv')